Fix runaway runner deletion on scale-down when API quota is hit (#938)

Rethrow the octokit 'API rate limit exceeded' errors when fetching runner info on `scale-down` instead of consuming it. Please see SEV pytorch/pytorch#87500 for details. Note: I've decided on the least invasive implementation (re-throwing only very specific class of exceptions) to avoid unintended side effects. Need to discuss with @jeanschmidt if all exceptions could be safely rethrown. Testing: * Unit tests
pytorch · Oct 21, 2022 · 4412592 · 4412592
1 parent f6c460d
commit 4412592
Show file tree

Hide file tree

Showing 5 changed files with 71 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 .vscode
 .venv
+.idea
 
 *.env
 *.zip

diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json b/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json
@@ -34,6 +34,7 @@
     "@octokit/auth-app": "^3.0.0",
     "@octokit/rest": "^18.3.5",
     "@octokit/types": "^6.12.2",
+    "@octokit/request-error": "^2.1.0",
     "@types/aws-lambda": "^8.10.72",
     "@types/express": "^4.17.11",
     "@types/node": "^14.14.34",

diff --git a/...rm-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.test.ts b/...rm-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.test.ts
@@ -27,6 +27,7 @@ import {
   scaleDown,
   sortRunnersByLaunchTime,
 } from './scale-down';
+import { RequestError } from '@octokit/request-error';
 
 jest.mock('./gh-runners', () => ({
   /* eslint-disable-next-line @typescript-eslint/no-explicit-any */
@@ -1246,5 +1247,39 @@ describe('scale-down', () => {
       expect(mockedGetRunnerOrg).toBeCalledTimes(1);
       expect(mockedGetRunnerOrg).toBeCalledWith(org, ec2runner.ghRunnerId, metrics);
     });
+
+    it('getRunner throws when api rate limit is hit', async () => {
+      const mockedListGithubRunnersOrg = mocked(listGithubRunnersOrg);
+
+      mockedListGithubRunnersOrg.mockRejectedValueOnce(
+        new RequestError('API rate limit exceeded for installation ID 13954108.', 403, {
+          headers: {
+            'access-control-allow-origin': '*',
+            'x-ratelimit-limit': '20000',
+            'x-ratelimit-remaining': '0',
+            'x-ratelimit-reset': '1666378232',
+            'x-ratelimit-resource': 'core',
+            'x-ratelimit-used': '20006',
+            'x-xss-protection': '0',
+          },
+          request: {
+            method: 'GET',
+            url: 'https://api.github.com/orgs/pytorch/actions/runners?per_page=100',
+            headers: {
+              accept: 'application/vnd.github.v3+json',
+            },
+          },
+        }),
+      );
+
+      const ec2runner: RunnerInfo = {
+        org: org,
+        instanceId: 'instance-id-03',
+        runnerType: 'runnerType-01',
+        ghRunnerId: 'ghRunnerId-01',
+      };
+
+      await expect(getGHRunnerOrg(ec2runner, metrics)).rejects.toThrow(RequestError);
+    });
   });
 });
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.ts
@@ -15,6 +15,7 @@ import {
 import { ScaleDownMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './metrics';
 import { listRunners, resetRunnersCaches, terminateRunner } from './runners';
 import { getRepo, groupBy, Repo, RunnerInfo } from './utils';
+import { RequestError } from '@octokit/request-error';
 
 export async function scaleDown(): Promise<void> {
   const metrics = new ScaleDownMetrics();
@@ -174,6 +175,18 @@ export async function scaleDown(): Promise<void> {
   }
 }
 
+/**
+ * Returns true if the argument `e` is an octokit RequestError
+ * from the 'API rate limit exceeded' exception.
+ * @param e any error type
+ * @return true if the argument is 'API rate limit exceeded' exception, false otherwise.
+ */
+function isRateLimitError(e: unknown) {
+  const requestErr = e as RequestError | null;
+  const headers = requestErr?.headers || requestErr?.response?.headers;
+  return requestErr?.status === 403 && headers?.['x-ratelimit-remaining'] === '0';
+}
+
 export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMetrics): Promise<GhRunner | undefined> {
   const org = ec2runner.org as string;
   let ghRunner: GhRunner | undefined = undefined;
@@ -183,6 +196,9 @@ export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMe
     ghRunner = ghRunners.find((runner) => runner.name === ec2runner.instanceId);
   } catch (e) {
     console.warn('Failed to list active gh runners', e);
+    if (isRateLimitError(e)) {
+      throw e;
+    }
   }
 
   if (ghRunner === undefined && ec2runner.ghRunnerId !== undefined) {
@@ -197,6 +213,9 @@ export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMe
         `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) error when ` +
           `listGithubRunnersOrg call: ${e}`,
       );
+      if (isRateLimitError(e)) {
+        throw e;
+      }
     }
   }
   if (ghRunner) {
@@ -220,6 +239,9 @@ export async function getGHRunnerRepo(ec2runner: RunnerInfo, metrics: ScaleDownM
     ghRunner = ghRunners.find((runner) => runner.name === ec2runner.instanceId);
   } catch (e) {
     console.warn('Failed to list active gh runners', e);
+    if (isRateLimitError(e)) {
+      throw e;
+    }
   }
 
   if (ghRunner === undefined && ec2runner.ghRunnerId !== undefined) {
@@ -233,6 +255,9 @@ export async function getGHRunnerRepo(ec2runner: RunnerInfo, metrics: ScaleDownM
       console.warn(
         `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) error when getRunnerRepo call: ${e}`,
       );
+      if (isRateLimitError(e)) {
+        throw e;
+      }
     }
   }
   if (ghRunner !== undefined) {

diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock b/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock
@@ -600,6 +600,15 @@
     deprecation "^2.0.0"
     once "^1.4.0"
 
+"@octokit/request-error@^2.1.0":
+  version "2.1.0"
+  resolved "https://registry.yarnpkg.com/@octokit/request-error/-/request-error-2.1.0.tgz#9e150357831bfc788d13a4fd4b1913d60c74d677"
+  integrity sha512-1VIvgXxs9WHSjicsRwq8PlR2LR2x6DwsJAaFgzdi0JfJoGSO8mYI/cHJQ+9FbN21aa+DrgNLnwObmyeSC8Rmpg==
+  dependencies:
+    "@octokit/types" "^6.0.3"
+    deprecation "^2.0.0"
+    once "^1.4.0"
+
 "@octokit/request@^5.3.0", "@octokit/request@^5.4.11", "@octokit/request@^5.4.12":
   version "5.4.12"
   resolved "https://registry.yarnpkg.com/@octokit/request/-/request-5.4.12.tgz#b04826fa934670c56b135a81447be2c1723a2ffc"