Skip to content

Commit

Permalink
Fix runaway runner deletion on scale-down when API quota is hit (#938)
Browse files Browse the repository at this point in the history
Rethrow the octokit 'API rate limit exceeded' errors when fetching
runner info on `scale-down` instead of consuming it.

Please see SEV  pytorch/pytorch#87500 for details.

Note: I've decided on the least invasive implementation (re-throwing
only very specific class of exceptions) to avoid unintended side
effects. Need to discuss with @jeanschmidt if all exceptions could be
safely rethrown.

Testing:
* Unit tests
  • Loading branch information
izaitsevfb authored Oct 21, 2022
1 parent f6c460d commit 4412592
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
.vscode
.venv
.idea

*.env
*.zip
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
"@octokit/auth-app": "^3.0.0",
"@octokit/rest": "^18.3.5",
"@octokit/types": "^6.12.2",
"@octokit/request-error": "^2.1.0",
"@types/aws-lambda": "^8.10.72",
"@types/express": "^4.17.11",
"@types/node": "^14.14.34",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import {
scaleDown,
sortRunnersByLaunchTime,
} from './scale-down';
import { RequestError } from '@octokit/request-error';

jest.mock('./gh-runners', () => ({
/* eslint-disable-next-line @typescript-eslint/no-explicit-any */
Expand Down Expand Up @@ -1246,5 +1247,39 @@ describe('scale-down', () => {
expect(mockedGetRunnerOrg).toBeCalledTimes(1);
expect(mockedGetRunnerOrg).toBeCalledWith(org, ec2runner.ghRunnerId, metrics);
});

it('getRunner throws when api rate limit is hit', async () => {
const mockedListGithubRunnersOrg = mocked(listGithubRunnersOrg);

mockedListGithubRunnersOrg.mockRejectedValueOnce(
new RequestError('API rate limit exceeded for installation ID 13954108.', 403, {
headers: {
'access-control-allow-origin': '*',
'x-ratelimit-limit': '20000',
'x-ratelimit-remaining': '0',
'x-ratelimit-reset': '1666378232',
'x-ratelimit-resource': 'core',
'x-ratelimit-used': '20006',
'x-xss-protection': '0',
},
request: {
method: 'GET',
url: 'https://api.github.com/orgs/pytorch/actions/runners?per_page=100',
headers: {
accept: 'application/vnd.github.v3+json',
},
},
}),
);

const ec2runner: RunnerInfo = {
org: org,
instanceId: 'instance-id-03',
runnerType: 'runnerType-01',
ghRunnerId: 'ghRunnerId-01',
};

await expect(getGHRunnerOrg(ec2runner, metrics)).rejects.toThrow(RequestError);
});
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
import { ScaleDownMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './metrics';
import { listRunners, resetRunnersCaches, terminateRunner } from './runners';
import { getRepo, groupBy, Repo, RunnerInfo } from './utils';
import { RequestError } from '@octokit/request-error';

export async function scaleDown(): Promise<void> {
const metrics = new ScaleDownMetrics();
Expand Down Expand Up @@ -174,6 +175,18 @@ export async function scaleDown(): Promise<void> {
}
}

/**
* Returns true if the argument `e` is an octokit RequestError
* from the 'API rate limit exceeded' exception.
* @param e any error type
* @return true if the argument is 'API rate limit exceeded' exception, false otherwise.
*/
function isRateLimitError(e: unknown) {
const requestErr = e as RequestError | null;
const headers = requestErr?.headers || requestErr?.response?.headers;
return requestErr?.status === 403 && headers?.['x-ratelimit-remaining'] === '0';
}

export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMetrics): Promise<GhRunner | undefined> {
const org = ec2runner.org as string;
let ghRunner: GhRunner | undefined = undefined;
Expand All @@ -183,6 +196,9 @@ export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMe
ghRunner = ghRunners.find((runner) => runner.name === ec2runner.instanceId);
} catch (e) {
console.warn('Failed to list active gh runners', e);
if (isRateLimitError(e)) {
throw e;
}
}

if (ghRunner === undefined && ec2runner.ghRunnerId !== undefined) {
Expand All @@ -197,6 +213,9 @@ export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMe
`Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) error when ` +
`listGithubRunnersOrg call: ${e}`,
);
if (isRateLimitError(e)) {
throw e;
}
}
}
if (ghRunner) {
Expand All @@ -220,6 +239,9 @@ export async function getGHRunnerRepo(ec2runner: RunnerInfo, metrics: ScaleDownM
ghRunner = ghRunners.find((runner) => runner.name === ec2runner.instanceId);
} catch (e) {
console.warn('Failed to list active gh runners', e);
if (isRateLimitError(e)) {
throw e;
}
}

if (ghRunner === undefined && ec2runner.ghRunnerId !== undefined) {
Expand All @@ -233,6 +255,9 @@ export async function getGHRunnerRepo(ec2runner: RunnerInfo, metrics: ScaleDownM
console.warn(
`Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) error when getRunnerRepo call: ${e}`,
);
if (isRateLimitError(e)) {
throw e;
}
}
}
if (ghRunner !== undefined) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,15 @@
deprecation "^2.0.0"
once "^1.4.0"

"@octokit/request-error@^2.1.0":
version "2.1.0"
resolved "https://registry.yarnpkg.com/@octokit/request-error/-/request-error-2.1.0.tgz#9e150357831bfc788d13a4fd4b1913d60c74d677"
integrity sha512-1VIvgXxs9WHSjicsRwq8PlR2LR2x6DwsJAaFgzdi0JfJoGSO8mYI/cHJQ+9FbN21aa+DrgNLnwObmyeSC8Rmpg==
dependencies:
"@octokit/types" "^6.0.3"
deprecation "^2.0.0"
once "^1.4.0"

"@octokit/request@^5.3.0", "@octokit/request@^5.4.11", "@octokit/request@^5.4.12":
version "5.4.12"
resolved "https://registry.yarnpkg.com/@octokit/request/-/request-5.4.12.tgz#b04826fa934670c56b135a81447be2c1723a2ffc"
Expand Down

0 comments on commit 4412592

Please sign in to comment.