Skip to content

Commit

Permalink
[RPC Gateway Fallback Redesign] Implement the online path (#601)
Browse files Browse the repository at this point in the history
Following the design https://www.notion.so/uniswaplabs/Redesign-RPC-Gateway-Fallback-7c2ea784ae554f81a6d488bc52fab5ae?pvs=4 and the previous PR for the offline path (#583), we modify the online path to support automatic fallback and recovery based on DB states.

A few highlights in this PR
- The previous "health score" based calculation has been removed entirely. The online path now only publishes metrics and listens to the DB for provider health state updates.
- Automatic fallback and recovery can be turned on/off on a per-chain basis. This helps us test it fully, for example, by deploying it for a testnet with prod traffic.
- All UTs have been updated. All unused util classes have been removed.

Tested on local env:
- Verified the offline path to be working: the DB state is automatically updated based on the alarm state.
- Verified the online path to be working: we see a switch of provider selection after the preferred provider's DB state becomes unhealthy. We also see the reverse switch of provider selection when the provider's DB state returns to healthy.
  - This is observed using the newly updated dashboards, which show the DB state changes and `SingleJsonProvider`'s state change history, as well as the number of times each provider is selected when serving RPC calls.

An example of a provider switch: RPC requests move from Nirvana to Infura after Nirvana becomes unhealthy.

![image.png](https://graphite-user-uploaded-assets-prod.s3.amazonaws.com/n7CRFgTfi6wAndDXekPd/bdc8a736-21bf-4a83-a7a3-2a1f27dfc82a.png)

Another example: we let the primary provider fail and see the traffic go to the backup provider; we then let the primary provider recover and see the traffic go back to the primary provider:

![Screenshot 2024-04-23 at 10.23.01 AM.png](https://graphite-user-uploaded-assets-prod.s3.amazonaws.com/n7CRFgTfi6wAndDXekPd/25359834-65d1-4840-be70-8b70c68e3e56.png)
  • Loading branch information
rollingtumbling authored Apr 23, 2024
1 parent 9d79d1c commit 5a9396d
Show file tree
Hide file tree
Showing 16 changed files with 231 additions and 1,170 deletions.
3 changes: 1 addition & 2 deletions bin/stacks/routing-api-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ export class RoutingAPIStack extends cdk.Stack {
cachedV3PoolsDynamoDb,
cachedV2PairsDynamoDb,
tokenPropertiesCachingDynamoDb,
rpcProviderStateDynamoDb,
rpcProviderHealthStateDynamoDb,
} = new RoutingDatabaseStack(this, 'RoutingDatabaseStack', {})

Expand All @@ -115,7 +114,7 @@ export class RoutingAPIStack extends cdk.Stack {
cachedV3PoolsDynamoDb,
cachedV2PairsDynamoDb,
tokenPropertiesCachingDynamoDb,
rpcProviderStateDynamoDb,
rpcProviderHealthStateDynamoDb,
unicornSecret,
})

Expand Down
6 changes: 3 additions & 3 deletions bin/stacks/routing-database-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ export const DynamoDBTableProps = {
Name: 'TokenPropertiesCachingDb',
PartitionKeyName: 'chainIdTokenAddress',
},
// NOTICE: This table is obsolete. Do not touch it.
RpcProviderStateDbTable: {
Name: 'RpcProviderState',
PartitionKeyName: 'chainIdProviderName',
Expand All @@ -64,7 +65,6 @@ export class RoutingDatabaseStack extends cdk.NestedStack {
public readonly cachedV3PoolsDynamoDb: aws_dynamodb.Table
public readonly cachedV2PairsDynamoDb: aws_dynamodb.Table
public readonly tokenPropertiesCachingDynamoDb: aws_dynamodb.Table
public readonly rpcProviderStateDynamoDb: aws_dynamodb.Table
public readonly rpcProviderHealthStateDynamoDb: aws_dynamodb.Table

constructor(scope: Construct, name: string, props: RoutingDatabaseStackProps) {
Expand Down Expand Up @@ -153,8 +153,8 @@ export class RoutingDatabaseStack extends cdk.NestedStack {
}
)

// NOTICE: This table will become useless after we fully migrate to rpcProviderHealthStateDynamoDb
this.rpcProviderStateDynamoDb = new aws_dynamodb.Table(this, DynamoDBTableProps.RpcProviderStateDbTable.Name, {
// NOTICE: This table has become useless after we fully migrate to rpcProviderHealthStateDynamoDb
new aws_dynamodb.Table(this, DynamoDBTableProps.RpcProviderStateDbTable.Name, {
tableName: DynamoDBTableProps.RpcProviderStateDbTable.Name,
partitionKey: {
name: DynamoDBTableProps.RpcProviderStateDbTable.PartitionKeyName,
Expand Down
8 changes: 4 additions & 4 deletions bin/stacks/routing-lambda-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ export interface RoutingLambdaStackProps extends cdk.NestedStackProps {
cachedV3PoolsDynamoDb: aws_dynamodb.Table
cachedV2PairsDynamoDb: aws_dynamodb.Table
tokenPropertiesCachingDynamoDb: aws_dynamodb.Table
rpcProviderStateDynamoDb: aws_dynamodb.Table
rpcProviderHealthStateDynamoDb: aws_dynamodb.Table
unicornSecret: string
}
export class RoutingLambdaStack extends cdk.NestedStack {
Expand Down Expand Up @@ -61,7 +61,7 @@ export class RoutingLambdaStack extends cdk.NestedStack {
cachedV3PoolsDynamoDb,
cachedV2PairsDynamoDb,
tokenPropertiesCachingDynamoDb,
rpcProviderStateDynamoDb,
rpcProviderHealthStateDynamoDb,
unicornSecret,
} = props

Expand All @@ -88,7 +88,7 @@ export class RoutingLambdaStack extends cdk.NestedStack {
cachedV3PoolsDynamoDb.grantReadWriteData(lambdaRole)
cachedV2PairsDynamoDb.grantReadWriteData(lambdaRole)
tokenPropertiesCachingDynamoDb.grantReadWriteData(lambdaRole)
rpcProviderStateDynamoDb.grantReadWriteData(lambdaRole)
rpcProviderHealthStateDynamoDb.grantReadWriteData(lambdaRole)

const region = cdk.Stack.of(this).region

Expand Down Expand Up @@ -134,7 +134,7 @@ export class RoutingLambdaStack extends cdk.NestedStack {
CACHING_REQUEST_FLAG_TABLE_NAME: DynamoDBTableProps.CachingRequestFlagDynamoDbTable.Name,
CACHED_V3_POOLS_TABLE_NAME: DynamoDBTableProps.V3PoolsDynamoDbTable.Name,
V2_PAIRS_CACHE_TABLE_NAME: DynamoDBTableProps.V2PairsDynamoCache.Name,
RPC_PROVIDER_HEALTH_TABLE_NAME: DynamoDBTableProps.RpcProviderStateDbTable.Name,
RPC_PROVIDER_HEALTH_TABLE_NAME: DynamoDBTableProps.RpcProviderHealthStateDbTable.Name,

// tokenPropertiesCachingDynamoDb.tableName is the correct format.
// we will start using the correct ones going forward
Expand Down
87 changes: 28 additions & 59 deletions bin/stacks/rpc-gateway-dashboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,45 +38,55 @@ function getSelectMetricsForChain(chainId: ChainId) {
return metrics
}

function getHealthScoreMetricsForChain(chainId: ChainId) {
function getProviderDbHealthStateChangeForChain(chainId: ChainId) {
const metrics = []
for (const providerName of getProviderNameForChain(chainId)) {
const providerId = getProviderId(chainId, providerName)
metrics.push([
'Uniswap',
`RPC_GATEWAY_FALLBACK_${providerId}_INTO_UNHEALTHY`,
'Service',
'RoutingAPI',
{
id: `db_into_unhealthy_${chainId}_${providerName}`,
label: `${providerName} DB into UNHEALTHY ${ID_TO_NETWORK_NAME(chainId)}`,
},
])
metrics.push([
'Uniswap',
`RPC_GATEWAY_${chainId}_${providerName}_health_score`,
`RPC_GATEWAY_FALLBACK_${providerId}_INTO_HEALTHY`,
'Service',
'RoutingAPI',
{
id: `health_score_${chainId}_${providerName}`,
label: `${providerName} health score on ${ID_TO_NETWORK_NAME(chainId)}`,
id: `db_into_healthy_${chainId}_${providerName}`,
label: `${providerName} DB into HEALTHY ${ID_TO_NETWORK_NAME(chainId)}`,
},
])
}
return metrics
}

function getProviderDbHealthStateChangeForChain(chainId: ChainId) {
function getProviderHealthStateChangeForChain(chainId: ChainId) {
const metrics = []
for (const providerName of getProviderNameForChain(chainId)) {
const providerId = getProviderId(chainId, providerName)
metrics.push([
'Uniswap',
`RPC_GATEWAY_FALLBACK_${providerId}_INTO_UNHEALTHY`,
`RPC_GATEWAY_${chainId}_${providerName}_becomes_UNHEALTHY`,
'Service',
'RoutingAPI',
{
id: `db_into_unhealthy_${chainId}_${providerName}`,
label: `${providerName} DB into UNHEALTHY ${ID_TO_NETWORK_NAME(chainId)}`,
id: `provider_into_unhealthy_${chainId}_${providerName}`,
label: `${providerName} into UNHEALTHY ${ID_TO_NETWORK_NAME(chainId)}`,
},
])
metrics.push([
'Uniswap',
`RPC_GATEWAY_FALLBACK_${providerId}_INTO_HEALTHY`,
`RPC_GATEWAY_${chainId}_${providerName}_becomes_HEALTHY`,
'Service',
'RoutingAPI',
{
id: `db_into_healthy_${chainId}_${providerName}`,
label: `${providerName} DB into HEALTHY ${ID_TO_NETWORK_NAME(chainId)}`,
id: `provider_into_healthy_${chainId}_${providerName}`,
label: `${providerName} into HEALTHY ${ID_TO_NETWORK_NAME(chainId)}`,
},
])
}
Expand Down Expand Up @@ -122,26 +132,6 @@ function getSuccessMetricsForChain(chainId: ChainId) {
return metrics
}

function getHighLatencyMetricsForChain(chainId: ChainId) {
const metrics = []
const methodNames = ['call', 'send', 'getGasPrice', 'getBlockNumber']
for (const providerName of getProviderNameForChain(chainId)) {
for (const methodName of methodNames) {
metrics.push([
'Uniswap',
`RPC_GATEWAY_${chainId}_${providerName}_${methodName}_SUCCESS_HIGH_LATENCY`,
'Service',
'RoutingAPI',
{
id: `${methodName}_high_latency_${chainId}_${providerName}`,
label: `${providerName} ${methodName} high latency on ${ID_TO_NETWORK_NAME(chainId)}`,
},
])
}
}
return metrics
}

function getFailedMetricsForChain(chainId: ChainId) {
const metrics = []
const methodNames = ['call', 'send', 'getGasPrice', 'getBlockNumber']
Expand Down Expand Up @@ -607,18 +597,18 @@ export class RpcGatewayDashboardStack extends cdk.NestedStack {
width: 24,
type: 'metric',
properties: {
metrics: getHealthScoreMetricsForChain(chainId),
metrics: getProviderDbHealthStateChangeForChain(chainId),
view: 'timeSeries',
stacked: false,
region,
stat: 'Maximum',
period: 300,
title: `Provider (negative) health score for ${ID_TO_NETWORK_NAME(chainId)}`,
title: `Provider DB health change for ${ID_TO_NETWORK_NAME(chainId)}`,
setPeriodToTimeRange: true,
yAxis: {
left: {
showUnits: false,
label: 'Score (in negative)',
label: 'DB health state changes',
},
},
},
Expand All @@ -628,18 +618,18 @@ export class RpcGatewayDashboardStack extends cdk.NestedStack {
width: 24,
type: 'metric',
properties: {
metrics: getProviderDbHealthStateChangeForChain(chainId),
metrics: getProviderHealthStateChangeForChain(chainId),
view: 'timeSeries',
stacked: false,
region,
stat: 'Maximum',
period: 300,
title: `Provider DB health change for ${ID_TO_NETWORK_NAME(chainId)}`,
title: `Provider health state for ${ID_TO_NETWORK_NAME(chainId)}`,
setPeriodToTimeRange: true,
yAxis: {
left: {
showUnits: false,
label: 'DB health state changes',
label: 'Health state changes',
},
},
},
Expand Down Expand Up @@ -707,27 +697,6 @@ export class RpcGatewayDashboardStack extends cdk.NestedStack {
},
},
},
{
height: 8,
width: 24,
type: 'metric',
properties: {
metrics: getHighLatencyMetricsForChain(chainId),
view: 'timeSeries',
stacked: false,
region,
stat: 'Sum',
period: 300,
title: `Provider high latency occurrence for ${ID_TO_NETWORK_NAME(chainId)}`,
setPeriodToTimeRange: true,
yAxis: {
left: {
showUnits: false,
label: 'Requests',
},
},
},
},
{
height: 8,
width: 24,
Expand Down
4 changes: 2 additions & 2 deletions bin/stacks/rpc-gateway-fallback-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ export class RpcGatewayFallbackStack extends cdk.NestedStack {
metric,
comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
// TODO(jie): Resume to a reasonable threshold once we verified the workflow in prod.
threshold: 0.1, // Alarm when error rate >= 0.1%
threshold: 1.0, // Alarm when error rate >= 1.0%
evaluationPeriods: 1,
})

Expand Down Expand Up @@ -127,7 +127,7 @@ export class RpcGatewayFallbackStack extends cdk.NestedStack {
metric,
comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
// TODO(jie): Resume to a reasonable threshold once we verified the workflow in prod.
threshold: 100, // Alarm when latency >= 100ms
threshold: 150, // Alarm when latency >= 150ms
evaluationPeriods: 1,
})

Expand Down
16 changes: 0 additions & 16 deletions lib/rpc/ProviderState.ts

This file was deleted.

104 changes: 0 additions & 104 deletions lib/rpc/ProviderStateDynamoDbRepository.ts

This file was deleted.

Loading

0 comments on commit 5a9396d

Please sign in to comment.