ARC-2713 Use Retry-After header in Jira client (#2590)
* ARC-2713 add retry on 429 for the Jira client
gxueatlassian authored Dec 5, 2023
1 parent 1ec2d17 commit 20d2467
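
In short, this change makes the Jira HTTP client surface the Retry-After header on 429 responses as a dedicated error type, and makes the SQS error handler honour that value as the retry delay (behind the USE_RATELIMIT_ON_JIRA_CLIENT feature flag). A minimal sketch of that flow, with simplified names that are not the exact implementation in this diff:

import axios, { AxiosError } from "axios";

// Illustrative stand-in for the commit's JiraClientRateLimitingError.
class RateLimitedError extends Error {
	constructor(readonly status: number, readonly retryAfterInSeconds?: number) {
		super(`Jira responded with HTTP ${status}`);
	}
}

// Parse Retry-After (seconds) from the response; undefined when absent or non-numeric.
const retryAfterSeconds = (error: AxiosError): number | undefined => {
	const raw = error.response?.headers?.["retry-after"];
	const parsed = parseInt(String(raw ?? ""), 10);
	return Number.isNaN(parsed) ? undefined : parsed;
};

// example.atlassian.net is a placeholder host.
const client = axios.create({ baseURL: "https://example.atlassian.net" });
client.interceptors.response.use(undefined, (error: AxiosError) => {
	if (error.response?.status === 429) {
		return Promise.reject(new RateLimitedError(429, retryAfterSeconds(error)));
	}
	return Promise.reject(error);
});

// A queue-side handler can then prefer the server-provided delay over its default backoff.
const retryDelaySec = (err: unknown, fallbackSec = 180): number =>
	err instanceof RateLimitedError && err.retryAfterInSeconds !== undefined ? err.retryAfterInSeconds : fallbackSec;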
Showing 10 changed files with 218 additions and 31 deletions.
3 changes: 2 additions & 1 deletion src/config/feature-flags.ts
@@ -24,7 +24,8 @@ export enum BooleanFlags {
ENABLE_5KU_BACKFILL_PAGE = "enable-5ku-experience-backfill-page",
USE_DYNAMODB_TO_PERSIST_AUDIT_LOG = "use-dynamodb-to-persist-audit-log",
USE_CUSTOM_ROOT_CA_BUNDLE = "use-custom-root-ca-bundle",
GENERATE_CORE_HEAP_DUMPS_ON_LOW_MEM = "generate-core-heap-dumps-on-low-mem"
GENERATE_CORE_HEAP_DUMPS_ON_LOW_MEM = "generate-core-heap-dumps-on-low-mem",
USE_RATELIMIT_ON_JIRA_CLIENT = "use-ratelimit-on-jira-client"
}

export enum StringFlags {
1 change: 1 addition & 0 deletions src/config/metric-names.ts
@@ -16,6 +16,7 @@ export const sqsQueueMetrics = {
received: `${server}.sqs.queue.received`,
completed: `${server}.sqs.queue.success`,
failed: `${server}.sqs.queue.failed`,
exception: `${server}.sqs.queue.exception`,
sent: `${server}.sqs.queue.sent`,
deleted: `${server}.sqs.queue.deleted`,
duration: `${server}.sqs.queue.duration`
17 changes: 17 additions & 0 deletions src/jira/client/axios.test.ts
@@ -120,4 +120,21 @@ describe("Jira axios instance", () => {
expect(error?.message).toEqual("Error executing Axios Request HTTP 404 - Bad REST path, or Jira instance not found, renamed or temporarily suspended.");
});

describe("when having a rate limited", () => {
it("should extract the retry after header if present", async () => {
const requestPayload = "TestRequestPayload";
jiraNock.post("/foo/bar", requestPayload)
.reply(429, "", {
"Retry-After": "100"
});

await expect(getAxiosInstance(jiraHost, "secret", getLogger("test")).post("/foo/bar", requestPayload))
.rejects.toMatchObject({
status: 429,
retryAfterInSeconds: 100
});

});
});

});
33 changes: 32 additions & 1 deletion src/jira/client/axios.ts
@@ -18,13 +18,30 @@ export class JiraClientError extends Error {
status?: number;
cause: AxiosError;

constructor(message: string, cause: AxiosError, status?: number) {
constructor(
message: string,
cause: AxiosError,
status: number | undefined
) {
super(message);
this.status = status;
this.cause = cause;
}
}

export class JiraClientRateLimitingError extends JiraClientError {
retryAfterInSeconds: number | undefined;
constructor(
message: string,
cause: AxiosError,
status: number | undefined,
retryAfterInSeconds: number | undefined
) {
super(message, cause, status);
this.retryAfterInSeconds = retryAfterInSeconds;
}
}

export const getJiraErrorMessages = (status: number) => {
switch (status) {
case 400:
@@ -86,9 +103,23 @@ const getErrorMiddleware = (logger: Logger) =>
logger.error({ err: error, res: error?.response }, errorMessage);
}

if (error.response?.status === 429) {
return Promise.reject(new JiraClientRateLimitingError(errorMessage, error, status, getRetryAfterInSec(error)));
}

return Promise.reject(new JiraClientError(errorMessage, error, status));
};

const getRetryAfterInSec = (error: AxiosError): number | undefined => {

const retryAfterInSecondsStr = error.response?.headers["retry-after"];
const retryAfterInSeconds = parseInt(retryAfterInSecondsStr || "unknown");

if (isNaN(retryAfterInSeconds)) return undefined;

return retryAfterInSeconds;
};

/**
* Middleware to enhance successful requests in Jira.
*
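
A hedged usage sketch of the new error class above — the import paths, endpoint, and helper name are illustrative and not part of this commit; the real consumer is the SQS error handler further down:

import { getAxiosInstance, JiraClientRateLimitingError } from "./axios";
import { getLogger } from "config/logger";

// Post to Jira and, on a 429, hand back the server-suggested wait instead of throwing.
const postWithRateLimitHint = async (jiraHost: string, secret: string, body: unknown) => {
	try {
		return await getAxiosInstance(jiraHost, secret, getLogger("jira-client")).post("/foo/bar", body);
	} catch (err: unknown) {
		if (err instanceof JiraClientRateLimitingError && err.retryAfterInSeconds !== undefined) {
			return { retryInSeconds: err.retryAfterInSeconds };
		}
		throw err;
	}
};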
14 changes: 7 additions & 7 deletions src/sqs/backfill-error-handler.test.ts
@@ -125,7 +125,7 @@ describe("backfillErrorHandler", () => {
status: 403
};
const result = await backfillErrorHandler(jest.fn())(abuseDetectionError, createContext(2, false));
expect(result).toEqual({
expect(result).toMatchObject({
isFailure: true,
retryDelaySec: 540,
retryable: true
@@ -150,7 +150,7 @@ describe("backfillErrorHandler", () => {
}

const result = await backfillErrorHandler(jest.fn())(new TaskError(task, sequelizeConnectionError), createContext(2, false));
expect(result).toEqual({
expect(result).toMatchObject({
isFailure: true,
retryDelaySec: 540,
retryable: true
@@ -162,7 +162,7 @@ describe("backfillErrorHandler", () => {
new TaskError(task, (await create500FromGitHub())!),
createContext(2, false)
);
expect(result).toEqual({
expect(result).toMatchObject({
isFailure: true,
retryDelaySec: 540,
retryable: true
@@ -174,7 +174,7 @@ describe("backfillErrorHandler", () => {
new TaskError(task, (await create500FromJira())!),
createContext(2, false)
);
expect(result).toEqual({
expect(result).toMatchObject({
isFailure: true,
retryDelaySec: 540,
retryable: true
@@ -187,7 +187,7 @@ describe("backfillErrorHandler", () => {
createContext(5, true)
);

expect(result).toEqual({
expect(result).toMatchObject({
isFailure: false
});
const mockMessage = sendMessageMock.mock.calls[0] as any[];
@@ -204,7 +204,7 @@ describe("backfillErrorHandler", () => {
createContext(5, true)
);

expect(result).toEqual({
expect(result).toMatchObject({
isFailure: false
});
const mockMessage = sendMessageMock.mock.calls[0] as any[];
@@ -254,7 +254,7 @@ describe("backfillErrorHandler", () => {
createContext(3, false)
);

expect(result).toEqual({
expect(result).toMatchObject({
isFailure: false
});
const mockMessage = sendMessageMock.mock.calls[0] as any[];
22 changes: 17 additions & 5 deletions src/sqs/backfill-error-handler.ts
@@ -14,7 +14,7 @@ import {
import { booleanFlag, BooleanFlags } from "config/feature-flags";

const handleTaskError = async (sendSQSBackfillMessage: (message, delaySec, logger) => Promise<SQS.SendMessageResult>, task: Task, cause: Error, context: SQSMessageContext<BackfillMessagePayload>, rootLogger: Logger
) => {
): Promise<ErrorHandlingResult> => {
const log = rootLogger.child({
task,
receiveCount: context.receiveCount,
@@ -32,7 +32,10 @@ const handleTaskError = async (sendSQSBackfillMessage: (message, delaySec, logge
: log.warn("InvalidPermissionError: marking the task as failed and continue with the next one");
await markCurrentTaskAsFailedAndContinue(context.payload, task, true, sendSQSBackfillMessage, log, cause);
return {
isFailure: false
isFailure: false,
statusCode: cause.status,
source: "github",
errorName: cause.constructor.name
};
}

@@ -50,7 +53,10 @@ const handleTaskError = async (sendSQSBackfillMessage: (message, delaySec, logge
await sendSQSBackfillMessage(context.payload, 0, log);
}
return {
isFailure: false
isFailure: false,
statusCode: cause.status,
source: "github",
errorName: cause.constructor.name
};
}

@@ -59,7 +65,10 @@ const handleTaskError = async (sendSQSBackfillMessage: (message, delaySec, logge
: log.info("Repo was deleted, marking the task as completed");
await updateTaskStatusAndContinue(context.payload, { edges: [] }, task, log, sendSQSBackfillMessage);
return {
isFailure: false
isFailure: false,
statusCode: cause.status,
source: "github",
errorName: cause.constructor.name
};
}

@@ -69,7 +78,10 @@ const handleTaskError = async (sendSQSBackfillMessage: (message, delaySec, logge
: log.warn("That was the last attempt: marking the task as failed and continue with the next one");
await markCurrentTaskAsFailedAndContinue(context.payload, task, false, sendSQSBackfillMessage, log, cause);
return {
isFailure: false
isFailure: false,
statusCode: parseInt(String(cause["status"])),
source: "github",
errorName: cause.constructor.name
};
}

66 changes: 65 additions & 1 deletion src/sqs/error-handlers.test.ts
@@ -1,11 +1,15 @@
import { statsd } from "config/statsd";
import { jiraAndGitHubErrorsHandler, webhookMetricWrapper } from "./error-handlers";
import { getLogger } from "config/logger";
import { JiraClientError } from "../jira/client/axios";
import { JiraClientError, JiraClientRateLimitingError } from "../jira/client/axios";
import { Octokit } from "@octokit/rest";
import { GithubClientRateLimitingError } from "../github/client/github-client-errors";
import { AxiosError, AxiosResponse, AxiosResponseHeaders } from "axios";
import { ErrorHandlingResult, SQSMessageContext, BaseMessagePayload } from "~/src/sqs/sqs.types";
import { booleanFlag, BooleanFlags } from "config/feature-flags";
import { when } from "jest-when";

jest.mock("config/feature-flags");

describe("error-handlers", () => {

@@ -117,6 +121,66 @@ describe("error-handlers", () => {
expect(result.isFailure).toBe(true);
});

it("Retryable with proper delay on Rate Limiting - when ff is on", async () => {

when(booleanFlag).calledWith(BooleanFlags.USE_RATELIMIT_ON_JIRA_CLIENT, expect.anything())
.mockResolvedValue(true);

const headers: AxiosResponseHeaders = { "retry-after": "1000" };
const mockedResponse = { status: 403, headers: headers } as AxiosResponse;

const result = await jiraAndGitHubErrorsHandler(
new JiraClientRateLimitingError("test rate limit error", {
response: mockedResponse
} as AxiosError, 429, 1000),
createContext(1, false)
);

expect(result.retryable).toBe(true);
expect(result.retryDelaySec).toBe(1000);
expect(result.isFailure).toBe(true);
});

it("Retryable with origin delay on Rate Limiting but have undefined retry after header - when ff is on", async () => {

when(booleanFlag).calledWith(BooleanFlags.USE_RATELIMIT_ON_JIRA_CLIENT, expect.anything())
.mockResolvedValue(true);

const headers: AxiosResponseHeaders = { };
const mockedResponse = { status: 403, headers: headers } as AxiosResponse;

const result = await jiraAndGitHubErrorsHandler(
new JiraClientRateLimitingError("test rate limit error", {
response: mockedResponse
} as AxiosError, 429, undefined),
createContext(1, false)
);

expect(result.retryable).toBe(true);
expect(result.retryDelaySec).toBe(180);
expect(result.isFailure).toBe(true);
});

it("Retryable with origin delay on Rate Limiting - when ff is off", async () => {

when(booleanFlag).calledWith(BooleanFlags.USE_RATELIMIT_ON_JIRA_CLIENT, expect.anything())
.mockResolvedValue(false);

const headers: AxiosResponseHeaders = { "retry-after": "1000" };
const mockedResponse = { status: 403, headers: headers } as AxiosResponse;

const result = await jiraAndGitHubErrorsHandler(
new JiraClientRateLimitingError("test rate limit error", {
response: mockedResponse
} as AxiosError, 429, 1000),
createContext(1, false)
);

expect(result.retryable).toBe(true);
expect(result.retryDelaySec).toBe(180);
expect(result.isFailure).toBe(true);
});

it("Retryable with proper delay on Rate Limiting - receiveCount 3", async () => {
const headers: AxiosResponseHeaders = { "x-ratelimit-reset": `${Math.floor(new Date("2020-01-01").getTime() / 1000) + 100}` };
const mockedResponse = { status: 403, headers: headers } as AxiosResponse;
39 changes: 30 additions & 9 deletions src/sqs/error-handlers.ts
@@ -1,7 +1,8 @@
import { JiraClientError } from "../jira/client/axios";
import { JiraClientError, JiraClientRateLimitingError } from "../jira/client/axios";
import { emitWebhookFailedMetrics } from "utils/webhook-utils";
import { ErrorHandler, ErrorHandlingResult, SQSMessageContext, BaseMessagePayload } from "./sqs.types";
import { GithubClientRateLimitingError } from "../github/client/github-client-errors";
import { GithubClientError, GithubClientRateLimitingError } from "../github/client/github-client-errors";
import { booleanFlag, BooleanFlags } from "config/feature-flags";

/**
* Sometimes we can get errors from Jira and GitHub which does not indicate a failured webhook. For example:
@@ -25,7 +26,14 @@ export const handleUnknownError: ErrorHandler<BaseMessagePayload> = <MessagePayl
): Promise<ErrorHandlingResult> => {
const delaySec = EXPONENTIAL_BACKOFF_BASE_SEC * Math.pow(EXPONENTIAL_BACKOFF_MULTIPLIER, context.receiveCount);
context.log.warn({ err, delaySec }, "Unknown error: retrying with exponential backoff");
return Promise.resolve({ retryable: true, retryDelaySec: delaySec, isFailure: true });
return Promise.resolve({
retryable: true,
retryDelaySec: delaySec,
isFailure: true,
statusCode: parseInt(String(err["status"])),
source: err instanceof GithubClientError ? "github" : (err instanceof JiraClientError ? "jira" : "other"),
errorName: err.constructor.name
});
};

export const jiraAndGitHubErrorsHandler: ErrorHandler<BaseMessagePayload> = async <MessagePayload extends BaseMessagePayload> (error: Error,
Expand All @@ -34,7 +42,7 @@ export const jiraAndGitHubErrorsHandler: ErrorHandler<BaseMessagePayload> = asyn
context.log.warn({ err: error }, "Handling Jira or GitHub error");

const maybeResult = maybeHandleNonFailureCase(error, context)
|| maybeHandleRateLimitingError(error, context)
|| await maybeHandleRateLimitingError(error, context)
|| maybeHandleNonRetryableResponseCode(error, context);

if (maybeResult) {
@@ -67,7 +75,7 @@ const maybeHandleNonFailureCase = <MessagePayload extends BaseMessagePayload>(er
error.status &&
UNRETRYABLE_STATUS_CODES.includes(error.status)) {
context.log.warn(`Received ${error.status} from Jira. Unretryable. Discarding the message`);
return { retryable: false, isFailure: false };
return { retryable: false, isFailure: false, statusCode: error.status, source: "jira", errorName: error.constructor.name };
}

return undefined;
@@ -81,14 +89,19 @@ const maybeHandleNonRetryableResponseCode = <MessagePayload extends BaseMessageP
const status: number | undefined = error["status"] as number | undefined;
if (status && UNRETRYABLE_STATUS_CODES.includes(status)) {
context.log.warn({ err: error }, `Received error with ${status} status. Unretryable. Discarding the message`);
return { retryable: false, isFailure: false };
return {
retryable: false,
isFailure: false,
statusCode: status,
source: error instanceof GithubClientError ? "github" : (error instanceof JiraClientError ? "jira" : "other"),
errorName: error.constructor.name
};
}
return undefined;
};

const maybeHandleRateLimitingError = <MessagePayload extends BaseMessagePayload>(error: Error, context: SQSMessageContext<MessagePayload>): ErrorHandlingResult | undefined => {
const maybeHandleRateLimitingError = async <MessagePayload extends BaseMessagePayload>(error: Error, context: SQSMessageContext<MessagePayload>): Promise<ErrorHandlingResult | undefined> => {
if (error instanceof GithubClientRateLimitingError) {
context.log.warn({ error }, `Rate limiting error, retrying`);
// A stepped buffer to prioritize messages with a higher received count to get replayed slightly sooner than newer messages
// e.g. a receiveCount 5 message will be visible 50 seconds sooner than a receiveCount 1 message
const buffer = Math.max(RATE_LIMITING_BUFFER_STEP, BASE_RATE_LIMITING_DELAY_BUFFER_SEC - context.receiveCount * RATE_LIMITING_BUFFER_STEP);
@@ -97,7 +110,15 @@ const maybeHandleRateLimitingError = <MessagePayload extends BaseMessagePayload>
// GitHub Rate limit resets hourly, in the scenario of burst traffic it continues to overwhelm the rate limit
// this attempts to ease the load across multiple refreshes
const retryDelaySec = rateLimitReset + ONE_HOUR_IN_SECONDS * (context.receiveCount - 1);
return { retryable: true, retryDelaySec, isFailure: true };
context.log.warn({ error, source: "github", retryDelaySec }, `Rate limiting error, retrying`);
return { retryable: true, retryDelaySec, isFailure: true, source: "github", statusCode: error.status, errorName: error.constructor.name };
}

if (await booleanFlag(BooleanFlags.USE_RATELIMIT_ON_JIRA_CLIENT, context.payload.jiraHost)) {
if (error instanceof JiraClientRateLimitingError && error.retryAfterInSeconds !== undefined) {
context.log.warn({ error, source: "jira", retryDelaySec: error.retryAfterInSeconds }, `Rate limiting error, retrying`);
return { retryable: true, retryDelaySec: error.retryAfterInSeconds, isFailure: true, source: "jira", statusCode: error.status, errorName: error.constructor.name };
}
}

return undefined;
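
Pulling the handler logic together, the delay the queue ends up using for Jira 429s can be summarised by a small sketch; the 180 s fallback is the value the tests above expect at receiveCount 1, and the helper itself is illustrative rather than part of the commit:

// Prefer the Jira-provided Retry-After when the feature flag is on and the header was parseable;
// otherwise fall back to the handler's default exponential backoff.
const pickJiraRetryDelaySec = (flagOn: boolean, retryAfterInSeconds: number | undefined, backoffSec: number): number =>
	flagOn && retryAfterInSeconds !== undefined ? retryAfterInSeconds : backoffSec;

pickJiraRetryDelaySec(true, 1000, 180);      // 1000 - flag on, Retry-After honoured
pickJiraRetryDelaySec(true, undefined, 180); // 180  - header missing or unparseable
pickJiraRetryDelaySec(false, 1000, 180);     // 180  - flag off, original backoff behaviour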