1+ import { setTimeout } from 'node:timers/promises' ;
2+
13import { MIN_SUPPORTED_SNAPSHOT_READS_WIRE_VERSION } from '../cmap/wire_protocol/constants' ;
24import {
35 isRetryableReadError ,
@@ -10,6 +12,7 @@ import {
1012 MongoInvalidArgumentError ,
1113 MongoNetworkError ,
1214 MongoNotConnectedError ,
15+ MongoOperationTimeoutError ,
1316 MongoRuntimeError ,
1417 MongoServerError ,
1518 MongoTransactionError ,
@@ -26,9 +29,15 @@ import {
2629import type { Topology } from '../sdam/topology' ;
2730import type { ClientSession } from '../sessions' ;
2831import { TimeoutContext } from '../timeout' ;
29- import { abortable , maxWireVersion , supportsRetryableWrites } from '../utils' ;
32+ import { RETRY_COST , TOKEN_REFRESH_RATE } from '../token_bucket' ;
33+ import {
34+ abortable ,
35+ ExponentialBackoffProvider ,
36+ maxWireVersion ,
37+ supportsRetryableWrites
38+ } from '../utils' ;
3039import { AggregateOperation } from './aggregate' ;
31- import { AbstractOperation , Aspect } from './operation' ;
40+ import { AbstractOperation , Aspect , RetryContext } from './operation' ;
3241
3342const MMAPv1_RETRY_WRITES_ERROR_CODE = MONGODB_ERROR_CODES . IllegalOperation ;
3443const MMAPv1_RETRY_WRITES_ERROR_MESSAGE =
@@ -50,7 +59,7 @@ type ResultTypeFromOperation<TOperation extends AbstractOperation> = ReturnType<
5059 * The expectation is that this function:
5160 * - Connects the MongoClient if it has not already been connected, see {@link autoConnect}
5261 * - Creates a session if none is provided and cleans up the session it creates
53- * - Tries an operation and retries under certain conditions, see {@link tryOperation }
62+ * - Tries an operation and retries under certain conditions, see {@link executeOperationWithRetries }
5463 *
5564 * @typeParam T - The operation's type
5665 * @typeParam TResult - The type of the operation's result, calculated from T
@@ -120,7 +129,7 @@ export async function executeOperation<
120129 } ) ;
121130
122131 try {
123- return await tryOperation ( operation , {
132+ return await executeOperationWithRetries ( operation , {
124133 topology,
125134 timeoutContext,
126135 session,
@@ -184,7 +193,10 @@ type RetryOptions = {
184193 *
185194 * @param operation - The operation to execute
186195 * */
187- async function tryOperation < T extends AbstractOperation , TResult = ResultTypeFromOperation < T > > (
196+ async function executeOperationWithRetries <
197+ T extends AbstractOperation ,
198+ TResult = ResultTypeFromOperation < T >
199+ > (
188200 operation : T ,
189201 { topology, timeoutContext, session, readPreference } : RetryOptions
190202) : Promise < TResult > {
@@ -233,11 +245,27 @@ async function tryOperation<T extends AbstractOperation, TResult = ResultTypeFro
233245 session . incrementTransactionNumber ( ) ;
234246 }
235247
236- const maxTries = willRetry ? ( timeoutContext . csotEnabled ( ) ? Infinity : 2 ) : 1 ;
237248 let previousOperationError : MongoError | undefined ;
238249 const deprioritizedServers = new DeprioritizedServers ( ) ;
239250
240- for ( let tries = 0 ; tries < maxTries ; tries ++ ) {
251+ const backoffDelayProvider = new ExponentialBackoffProvider (
252+ 10_000 , // MAX_BACKOFF
253+ 100 , // base backoff
254+ 2 // backoff rate
255+ ) ;
256+
257+ const retryContext =
258+ operation . retryContext ??
259+ new RetryContext ( willRetry ? ( timeoutContext . csotEnabled ( ) ? Infinity : 2 ) : 1 ) ;
260+ for (
261+ let attempt = 0 ;
262+ attempt < retryContext . maxAttempts ;
263+ attempt ++ ,
264+ retryContext . maxAttempts =
265+ willRetry && previousOperationError ?. hasErrorLabel ( MongoErrorLabel . SystemOverloadedError )
266+ ? 6
267+ : retryContext . maxAttempts
268+ ) {
241269 if ( previousOperationError ) {
242270 if ( hasWriteAspect && previousOperationError . code === MMAPv1_RETRY_WRITES_ERROR_CODE ) {
243271 throw new MongoServerError ( {
@@ -247,15 +275,39 @@ async function tryOperation<T extends AbstractOperation, TResult = ResultTypeFro
247275 } ) ;
248276 }
249277
250- if ( operation . hasAspect ( Aspect . COMMAND_BATCHING ) && ! operation . canRetryWrite ) {
278+ const isRetryable =
279+ // bulk write commands are retryable if all operations in the batch are retryable
280+ ( operation . hasAspect ( Aspect . COMMAND_BATCHING ) && operation . canRetryWrite ) ||
281+ // if we have a retryable read or write operation, we can retry
282+ ( hasWriteAspect && willRetryWrite && isRetryableWriteError ( previousOperationError ) ) ||
283+ ( hasReadAspect && willRetryRead && isRetryableReadError ( previousOperationError ) ) ||
284+ // if we have a retryable, system overloaded error, we can retry
285+ ( previousOperationError . hasErrorLabel ( MongoErrorLabel . SystemOverloadedError ) &&
286+ previousOperationError . hasErrorLabel ( MongoErrorLabel . RetryableError ) ) ;
287+
288+ if ( ! isRetryable ) {
251289 throw previousOperationError ;
252290 }
253291
254- if ( hasWriteAspect && ! isRetryableWriteError ( previousOperationError ) )
255- throw previousOperationError ;
256-
257- if ( hasReadAspect && ! isRetryableReadError ( previousOperationError ) ) {
258- throw previousOperationError ;
292+ if ( previousOperationError . hasErrorLabel ( MongoErrorLabel . SystemOverloadedError ) ) {
293+ const delayMS = backoffDelayProvider . getNextBackoffDuration ( ) ;
294+
295+ // if the delay would exhaust the CSOT timeout, short-circuit.
296+ if ( timeoutContext . csotEnabled ( ) && delayMS > timeoutContext . remainingTimeMS ) {
297+ // TODO: is this the right error to throw?
298+ throw new MongoOperationTimeoutError (
299+ `MongoDB SystemOverload exponential backoff would exceed timeoutMS deadline: remaining CSOT deadline=${ timeoutContext . remainingTimeMS } , backoff delayMS=${ delayMS } ` ,
300+ {
301+ cause : previousOperationError
302+ }
303+ ) ;
304+ }
305+
306+ if ( ! topology . tokenBucket . consume ( RETRY_COST ) ) {
307+ throw previousOperationError ;
308+ }
309+
310+ await setTimeout ( delayMS ) ;
259311 }
260312
261313 if (
@@ -285,19 +337,34 @@ async function tryOperation<T extends AbstractOperation, TResult = ResultTypeFro
285337 operation . server = server ;
286338
287339 try {
288- // If tries > 0 and we are command batching we need to reset the batch.
289- if ( tries > 0 && operation . hasAspect ( Aspect . COMMAND_BATCHING ) ) {
340+ const isRetry = attempt > 0 ;
341+
342+ // If attempt > 0 and we are command batching we need to reset the batch.
343+ if ( isRetry && operation . hasAspect ( Aspect . COMMAND_BATCHING ) ) {
290344 operation . resetBatch ( ) ;
291345 }
292346
293347 try {
294348 const result = await server . command ( operation , timeoutContext ) ;
349+ topology . tokenBucket . deposit (
350+ isRetry
351+ ? // on successful retry, deposit the retry cost + the refresh rate.
352+ TOKEN_REFRESH_RATE + RETRY_COST
353+ : // otherwise, just deposit the refresh rate.
354+ TOKEN_REFRESH_RATE
355+ ) ;
295356 return operation . handleOk ( result ) ;
296357 } catch ( error ) {
297358 return operation . handleError ( error ) ;
298359 }
299360 } catch ( operationError ) {
300361 if ( ! ( operationError instanceof MongoError ) ) throw operationError ;
362+
363+ if ( ! operationError . hasErrorLabel ( MongoErrorLabel . SystemOverloadedError ) ) {
364+ // if an operation fails with an error that does not carry the SystemOverloadedError label, deposit RETRY_COST tokens.
365+ topology . tokenBucket . deposit ( RETRY_COST ) ;
366+ }
367+
301368 if (
302369 previousOperationError != null &&
303370 operationError . hasErrorLabel ( MongoErrorLabel . NoWritesPerformed )
@@ -312,8 +379,5 @@ async function tryOperation<T extends AbstractOperation, TResult = ResultTypeFro
312379 }
313380 }
314381
315- throw (
316- previousOperationError ??
317- new MongoRuntimeError ( 'Tried to propagate retryability error, but no error was found.' )
318- ) ;
382+ throw previousOperationError ?? new MongoRuntimeError ( 'ahh' ) ;
319383}
0 commit comments