temporalio · tdeebswihart · Feb 1, 2024 · Feb 1, 2024 · Feb 1, 2024 · Feb 1, 2024
@@ -820,6 +820,9 @@ const (
 	ReplicationBypassCorruptedData = "history.ReplicationBypassCorruptedData"
 	// ReplicationEnableDLQMetrics is the flag to emit DLQ metrics
 	ReplicationEnableDLQMetrics = "history.ReplicationEnableDLQMetrics"
+	// HistoryTaskDropInternalErrors causes history task processing to send tasks failing with serviceerror.Internal to
+	// the DLQ (or to drop them if the DLQ is not enabled). The key string keeps the "TaskDLQInternalErrors" spelling.
+	HistoryTaskDropInternalErrors = "history.TaskDLQInternalErrors"
 
 	// ReplicationStreamSyncStatusDuration sync replication status duration
 	ReplicationStreamSyncStatusDuration = "history.ReplicationStreamSyncStatusDuration"

@@ -1298,6 +1298,7 @@ var (
 	)
 	TaskNotActiveCounter         = NewCounterDef("task_errors_not_active_counter")
 	TaskNamespaceHandoverCounter = NewCounterDef("task_errors_namespace_handover")
+	TaskInternalErrorCounter     = NewCounterDef("task_errors_internal")
 	TaskThrottledCounter         = NewCounterDef(
 		"task_errors_throttled",
 		WithDescription("The number of history task processing errors caused by resource exhausted errors, excluding workflow busy case."),

@@ -520,6 +520,7 @@ func TestArchivalQueueTaskExecutor(t *testing.T) {
 				mockMetadata,
 				nil,
 				metrics.NoopMetricsHandler,
+				func() bool { return false },
 			)
 			err := executable.Execute()
 			if len(p.ExpectedErrorSubstrings) > 0 {

@@ -97,6 +97,7 @@ type Config struct {
 	QueueCriticalSlicesCount         dynamicconfig.IntPropertyFn
 	QueuePendingTaskMaxCount         dynamicconfig.IntPropertyFn
 	QueueMaxReaderCount              dynamicconfig.IntPropertyFn
+	TaskDropInternalErrors           dynamicconfig.BoolPropertyFn
 
 	TaskSchedulerEnableRateLimiter           dynamicconfig.BoolPropertyFn
 	TaskSchedulerEnableRateLimiterShadowMode dynamicconfig.BoolPropertyFn
@@ -380,6 +381,7 @@ func NewConfig(
 		QueueCriticalSlicesCount:         dc.GetIntProperty(dynamicconfig.QueueCriticalSlicesCount, 50),
 		QueuePendingTaskMaxCount:         dc.GetIntProperty(dynamicconfig.QueuePendingTaskMaxCount, 10000),
 		QueueMaxReaderCount:              dc.GetIntProperty(dynamicconfig.QueueMaxReaderCount, 2),
+		TaskDropInternalErrors:           dc.GetBoolProperty(dynamicconfig.HistoryTaskDropInternalErrors, false),
 
 		TaskSchedulerEnableRateLimiter:           dc.GetBoolProperty(dynamicconfig.TaskSchedulerEnableRateLimiter, false),
 		TaskSchedulerEnableRateLimiterShadowMode: dc.GetBoolProperty(dynamicconfig.TaskSchedulerEnableRateLimiterShadowMode, true),

@@ -41,6 +41,7 @@ import (
 	"go.temporal.io/server/common/backoff"
 	"go.temporal.io/server/common/clock"
 	"go.temporal.io/server/common/cluster"
+	"go.temporal.io/server/common/dynamicconfig"
 	"go.temporal.io/server/common/headers"
 	"go.temporal.io/server/common/log"
 	"go.temporal.io/server/common/log/tag"
@@ -129,6 +130,7 @@ type (
 		lastActiveness         bool
 		resourceExhaustedCount int // does NOT include consts.ErrResourceExhaustedBusyWorkflow
 		taggedMetricsHandler   metrics.Handler
+		dropInternalErrors     dynamicconfig.BoolPropertyFn
 	}
 )
 
@@ -144,7 +146,11 @@ func NewExecutable(
 	clusterMetadata cluster.Metadata,
 	logger log.Logger,
 	metricsHandler metrics.Handler,
+	dropInternalErrors dynamicconfig.BoolPropertyFn,
 ) Executable {
+	if dropInternalErrors == nil {
+		dropInternalErrors = func() bool { return false }
+	}
 	executable := &executableImpl{
 		Task:              task,
 		state:             ctasks.TaskStatePending,
@@ -166,6 +172,7 @@ func NewExecutable(
 		),
 		metricsHandler:       metricsHandler,
 		taggedMetricsHandler: metricsHandler,
+		dropInternalErrors:   dropInternalErrors,
 	}
 	executable.updatePriority()
 	return executable
@@ -341,6 +348,13 @@ func (e *executableImpl) HandleErr(err error) (retErr error) {
 		e.logger.Error("Drop task due to serialization error", tag.Error(err))
 		return nil
 	}
+	if common.IsInternalError(err) {
+		e.logger.Error("Encountered internal error processing tasks", tag.Error(err))
+		e.taggedMetricsHandler.Counter(metrics.TaskInternalErrorCounter.GetMetricName()).Record(1)
+		if e.dropInternalErrors() {
+			return nil
+		}
+	}
 
 	e.taggedMetricsHandler.Counter(metrics.TaskFailures.GetMetricName()).Record(1)
 

@@ -40,6 +40,7 @@ import (
 	"go.temporal.io/server/common/clock"
 	"go.temporal.io/server/common/cluster"
 	"go.temporal.io/server/common/definition"
+	"go.temporal.io/server/common/dynamicconfig"
 	"go.temporal.io/server/common/headers"
 	"go.temporal.io/server/common/log"
 	"go.temporal.io/server/common/metrics"
@@ -65,6 +66,11 @@ type (
 
 		timeSource *clock.EventTimeSource
 	}
+
+	params struct {
+		dropInternalErrors dynamicconfig.BoolPropertyFn
+	}
+	option func(*params)
 )
 
 func TestExecutableSuite(t *testing.T) {
@@ -297,6 +303,34 @@ func (s *executableSuite) TestExecuteHandleErr_Corrupted() {
 	s.NoError(executable.HandleErr(err))
 }
 
+// TestExecute_DropsInternalErrors_WhenEnabled checks that HandleErr returns nil for the
+// result of an Execute call whose executor panicked with a serviceerror.Internal, when the
+// executable was built with dropInternalErrors reporting true.
+func (s *executableSuite) TestExecute_DropsInternalErrors_WhenEnabled() {
+	dropEnabled := func(p *params) {
+		p.dropInternalErrors = func() bool { return true }
+	}
+	executable := s.newTestExecutable(dropEnabled)
+
+	// The mocked executor never returns normally: it panics with the internal error.
+	// NOTE(review): presumably Execute recovers the panic and hands it back as its error — confirm in Execute.
+	panicWithInternal := func(_ context.Context, _ Executable) ([]metrics.Tag, bool, error) {
+		panic(serviceerror.NewInternal("injected error"))
+	}
+	s.mockExecutor.EXPECT().Execute(gomock.Any(), executable).DoAndReturn(panicWithInternal)
+
+	execErr := executable.Execute()
+	s.NoError(executable.HandleErr(execErr))
+}
+
+// TestExecute_DoesntDropInternalErrors_WhenDisabled checks that HandleErr returns a non-nil
+// error for the result of an Execute call whose executor panicked with a serviceerror.Internal,
+// when the executable was built with dropInternalErrors reporting false.
+func (s *executableSuite) TestExecute_DoesntDropInternalErrors_WhenDisabled() {
+	dropDisabled := func(p *params) {
+		p.dropInternalErrors = func() bool { return false }
+	}
+	executable := s.newTestExecutable(dropDisabled)
+
+	// The mocked executor never returns normally: it panics with the internal error.
+	// NOTE(review): presumably Execute recovers the panic and hands it back as its error — confirm in Execute.
+	panicWithInternal := func(_ context.Context, _ Executable) ([]metrics.Tag, bool, error) {
+		panic(serviceerror.NewInternal("injected error"))
+	}
+	s.mockExecutor.EXPECT().Execute(gomock.Any(), executable).DoAndReturn(panicWithInternal)
+
+	execErr := executable.Execute()
+	s.Error(executable.HandleErr(execErr))
+}
+
 func (s *executableSuite) TestHandleErr_EntityNotExists() {
 	executable := s.newTestExecutable()
 
@@ -408,7 +442,13 @@ func (s *executableSuite) TestTaskCancellation() {
 	s.False(executable.IsRetryableError(errors.New("some random error")))
 }
 
-func (s *executableSuite) newTestExecutable() Executable {
+func (s *executableSuite) newTestExecutable(opts ...option) Executable {
+	p := params{
+		dropInternalErrors: func() bool { return false },
+	}
+	for _, opt := range opts {
+		opt(&p)
+	}
 	return NewExecutable(
 		DefaultReaderId,
 		tasks.NewFakeTask(
@@ -429,5 +469,6 @@ func (s *executableSuite) newTestExecutable() Executable {
 		s.mockClusterMetadata,
 		log.NewTestLogger(),
 		metrics.NoopMetricsHandler,
+		p.dropInternalErrors,
 	)
 }
@@ -184,6 +184,7 @@ func (s *memoryScheduledQueueSuite) newSpeculativeWorkflowTaskTimeoutTestExecuta
 			nil,
 			nil,
 			nil,
+			func() bool { return false },
 		),
 		wttt,
 	)

@@ -168,6 +168,7 @@ func newQueueBase(
 			shard.GetClusterMetadata(),
 			logger,
 			metricsHandler,
+			shard.GetConfig().TaskDropInternalErrors,
 		)
 	}
 

@@ -77,7 +77,7 @@ func (s *readerSuite) SetupTest() {
 	s.metricsHandler = metrics.NoopMetricsHandler
 
 	s.executableInitializer = func(readerID int64, t tasks.Task) Executable {
-		return NewExecutable(readerID, t, nil, nil, nil, NewNoopPriorityAssigner(), clock.NewRealTimeSource(), nil, nil, nil, metrics.NoopMetricsHandler)
+		return NewExecutable(readerID, t, nil, nil, nil, NewNoopPriorityAssigner(), clock.NewRealTimeSource(), nil, nil, nil, metrics.NoopMetricsHandler, func() bool { return false })
 	}
 	s.monitor = newMonitor(tasks.CategoryTypeScheduled, clock.NewRealTimeSource(), &MonitorOptions{
 		PendingTasksCriticalCount:   dynamicconfig.GetIntPropertyFn(1000),

@@ -69,7 +69,7 @@ func (s *sliceSuite) SetupTest() {
 	s.controller = gomock.NewController(s.T())
 
 	s.executableInitializer = func(readerID int64, t tasks.Task) Executable {
-		return NewExecutable(readerID, t, nil, nil, nil, NewNoopPriorityAssigner(), clock.NewRealTimeSource(), nil, nil, nil, metrics.NoopMetricsHandler)
+		return NewExecutable(readerID, t, nil, nil, nil, NewNoopPriorityAssigner(), clock.NewRealTimeSource(), nil, nil, nil, metrics.NoopMetricsHandler, func() bool { return false })
 	}
 	s.monitor = newMonitor(tasks.CategoryTypeScheduled, clock.NewRealTimeSource(), &MonitorOptions{
 		PendingTasksCriticalCount:   dynamicconfig.GetIntPropertyFn(1000),

@@ -106,6 +106,7 @@ func (q SpeculativeWorkflowTaskTimeoutQueue) NotifyNewTasks(ts []tasks.Task) {
 				q.clusterMetadata,
 				q.logger,
 				q.metricsHandler,
+				func() bool { return false },
 			), wttt)
 			q.timeoutQueue.Add(executable)
 		}

@@ -1610,5 +1610,6 @@ func (s *timerQueueActiveTaskExecutorSuite) newTaskExecutable(
 		s.mockClusterMetadata,
 		nil,
 		metrics.NoopMetricsHandler,
+		func() bool { return false },
 	)
 }
@@ -1505,5 +1505,6 @@ func (s *timerQueueStandbyTaskExecutorSuite) newTaskExecutable(
 		s.mockClusterMetadata,
 		nil,
 		metrics.NoopMetricsHandler,
+		func() bool { return false },
 	)
 }
@@ -2805,5 +2805,6 @@ func (s *transferQueueActiveTaskExecutorSuite) newTaskExecutable(
 		s.mockClusterMetadata,
 		nil,
 		metrics.NoopMetricsHandler,
+		func() bool { return false },
 	)
 }
@@ -1269,5 +1269,6 @@ func (s *transferQueueStandbyTaskExecutorSuite) newTaskExecutable(
 		s.mockClusterMetadata,
 		nil,
 		metrics.NoopMetricsHandler,
+		func() bool { return false },
 	)
 }
@@ -617,5 +617,6 @@ func (s *visibilityQueueTaskExecutorSuite) newTaskExecutable(
 		s.mockShard.GetClusterMetadata(),
 		nil,
 		metrics.NoopMetricsHandler,
+		func() bool { return false },
 	)
 }
-Original file line number
+Diff line change
@@ Expand Up @@
     			nil,
     			nil,
     			nil,
+    			func() bool { return false },
     		),
     		wttt,
     	)
@@ Expand Down @@