From a2f76a7cf0ca7a0af5b10905a29042585deab3b7 Mon Sep 17 00:00:00 2001 From: seaona Date: Mon, 23 Feb 2026 09:53:38 +0100 Subject: [PATCH] poc flaky analyzer extension --- .../actions/analyze-flaky-tests/action.yml | 79 +++ .github/actions/flaky-test-report/action.yml | 27 +- .github/scripts/analyze-flaky-tests/index.ts | 278 ++++++++++ .../knowledge/extension-flakiness-patterns.md | 502 ++++++++++++++++++ .../llm/claude-analyzer.ts | 136 +++++ .../analyze-flaky-tests/llm/prompt-builder.ts | 50 ++ .../scripts/analyze-flaky-tests/llm/tools.ts | 284 ++++++++++ .github/scripts/analyze-flaky-tests/types.ts | 45 ++ .../utils/job-log-fetcher.ts | 140 +++++ .../utils/knowledge-base.ts | 71 +++ .../utils/past-fixes-fetcher.ts | 123 +++++ .../utils/slack-reporter.ts | 137 +++++ .../utils/test-source-reader.ts | 124 +++++ .github/scripts/create-flaky-test-report.mjs | 79 ++- .github/workflows/flaky-test-ai-analysis.yml | 70 +++ package.json | 2 + yarn.lock | 89 ++++ 17 files changed, 2223 insertions(+), 13 deletions(-) create mode 100644 .github/actions/analyze-flaky-tests/action.yml create mode 100644 .github/scripts/analyze-flaky-tests/index.ts create mode 100644 .github/scripts/analyze-flaky-tests/knowledge/extension-flakiness-patterns.md create mode 100644 .github/scripts/analyze-flaky-tests/llm/claude-analyzer.ts create mode 100644 .github/scripts/analyze-flaky-tests/llm/prompt-builder.ts create mode 100644 .github/scripts/analyze-flaky-tests/llm/tools.ts create mode 100644 .github/scripts/analyze-flaky-tests/types.ts create mode 100644 .github/scripts/analyze-flaky-tests/utils/job-log-fetcher.ts create mode 100644 .github/scripts/analyze-flaky-tests/utils/knowledge-base.ts create mode 100644 .github/scripts/analyze-flaky-tests/utils/past-fixes-fetcher.ts create mode 100644 .github/scripts/analyze-flaky-tests/utils/slack-reporter.ts create mode 100644 .github/scripts/analyze-flaky-tests/utils/test-source-reader.ts create mode 100644 
.github/workflows/flaky-test-ai-analysis.yml diff --git a/.github/actions/analyze-flaky-tests/action.yml b/.github/actions/analyze-flaky-tests/action.yml new file mode 100644 index 00000000..61e957d6 --- /dev/null +++ b/.github/actions/analyze-flaky-tests/action.yml @@ -0,0 +1,79 @@ +name: Analyze Flaky Tests with AI +description: 'Analyzes flaky test failures using Claude AI and posts findings as Slack thread replies.' + +inputs: + github-token: + description: 'GitHub token with repo and actions:read access' + required: true + claude-api-key: + description: 'Anthropic API key for Claude' + required: true + slack-bot-token: + description: 'Slack Bot Token with chat:write scope' + required: true + slack-channel-id: + description: 'Slack channel ID to post findings to' + required: true + slack-thread-ts: + description: 'Slack thread timestamp to reply to (from the flaky test report)' + required: true + failures-json: + description: 'JSON array of test failures to analyze (from the flaky test report)' + required: true + target-owner: + description: 'Owner of the repository containing the tests' + required: false + default: 'MetaMask' + target-repo: + description: 'Repository containing the tests' + required: false + default: 'metamask-extension' + github-tools-repository: + description: 'The GitHub repository containing the GitHub tools.' + required: false + default: ${{ github.action_repository }} + github-tools-ref: + description: 'The SHA of the action to use.' 
+ required: false + default: ${{ github.action_ref }} + +runs: + using: composite + steps: + - name: Checkout GitHub tools repository + uses: actions/checkout@v6 + with: + repository: ${{ inputs.github-tools-repository }} + ref: ${{ inputs.github-tools-ref }} + path: ./github-tools + + - name: Set up Node.js + uses: actions/setup-node@v6 + with: + node-version-file: ./github-tools/.nvmrc + cache-dependency-path: ./github-tools/yarn.lock + cache: yarn + + - name: Enable Corepack + working-directory: ./github-tools + shell: bash + run: corepack enable + + - name: Install dependencies + working-directory: ./github-tools + shell: bash + run: yarn --immutable + + - name: Run AI analysis + env: + GITHUB_TOKEN: ${{ inputs.github-token }} + E2E_CLAUDE_API_KEY: ${{ inputs.claude-api-key }} + SLACK_BOT_TOKEN: ${{ inputs.slack-bot-token }} + SLACK_CHANNEL_ID: ${{ inputs.slack-channel-id }} + SLACK_THREAD_TS: ${{ inputs.slack-thread-ts }} + TARGET_OWNER: ${{ inputs.target-owner }} + TARGET_REPO: ${{ inputs.target-repo }} + FAILURES_JSON: ${{ inputs.failures-json }} + working-directory: ./github-tools + shell: bash + run: yarn ts-node --swc .github/scripts/analyze-flaky-tests/index.ts diff --git a/.github/actions/flaky-test-report/action.yml b/.github/actions/flaky-test-report/action.yml index b907583a..d9dac57a 100644 --- a/.github/actions/flaky-test-report/action.yml +++ b/.github/actions/flaky-test-report/action.yml @@ -12,8 +12,17 @@ inputs: description: 'GitHub token with repo and actions:read access' required: true slack-webhook-flaky-tests: - description: 'Slack webhook URL for flaky test reports' - required: true + description: 'Slack webhook URL for flaky test reports (legacy fallback)' + required: false + default: '' + slack-bot-token: + description: 'Slack Bot Token with chat:write scope (preferred over webhook for thread support)' + required: false + default: '' + slack-channel-id: + description: 'Slack channel ID to post the report to (required when using 
slack-bot-token)' + required: false + default: '' github-tools-repository: description: 'The GitHub repository containing the GitHub tools. Defaults to the GitHub tools action repositor, and usually does not need to be changed.' required: false @@ -23,6 +32,17 @@ inputs: required: false default: ${{ github.action_ref }} +outputs: + thread_ts: + description: 'Slack thread timestamp of the posted report (only available when using slack-bot-token)' + value: ${{ steps.report.outputs.thread_ts }} + has_failures: + description: 'Whether any test failures were found (true/false)' + value: ${{ steps.report.outputs.has_failures }} + failures_json: + description: 'JSON array of the top 10 test failures for downstream analysis' + value: ${{ steps.report.outputs.failures_json }} + runs: using: composite steps: @@ -51,11 +71,14 @@ runs: run: yarn --immutable - name: Run flaky test report script + id: report env: REPOSITORY: ${{ inputs.repository }} WORKFLOW_ID: ${{ inputs.workflow-id }} GITHUB_TOKEN: ${{ inputs.github-token }} SLACK_WEBHOOK_FLAKY_TESTS: ${{ inputs.slack-webhook-flaky-tests }} + SLACK_BOT_TOKEN: ${{ inputs.slack-bot-token }} + SLACK_CHANNEL_ID: ${{ inputs.slack-channel-id }} working-directory: ./github-tools shell: bash run: node .github/scripts/create-flaky-test-report.mjs diff --git a/.github/scripts/analyze-flaky-tests/index.ts b/.github/scripts/analyze-flaky-tests/index.ts new file mode 100644 index 00000000..5029f0a6 --- /dev/null +++ b/.github/scripts/analyze-flaky-tests/index.ts @@ -0,0 +1,278 @@ +import fs from 'fs'; +import { Octokit } from '@octokit/rest'; +import type { AnalysisResult, FlakyTestFailure, SlackFinding } from './types'; +import { fetchJobLog } from './utils/job-log-fetcher'; +import { postSlackFindings } from './utils/slack-reporter'; +import { buildInitialPrompt } from './llm/prompt-builder'; +import { analyzeWithClaude } from './llm/claude-analyzer'; +import { executeToolCall } from './llm/tools'; +import type { ToolContext } from 
'./llm/tools'; +import { + getKnowledgeSection, + listKnowledgeSections, +} from './utils/knowledge-base'; + +interface Config { + githubToken: string; + claudeApiKey: string; + slackBotToken: string; + slackChannelId: string; + slackThreadTs: string; + targetOwner: string; + targetRepo: string; + failuresJson: string; + dryRun: boolean; +} + +function loadConfig(): Config { + const mockLlm = process.argv.includes('--mock-llm'); + const requiredVars = mockLlm ? ['GITHUB_TOKEN'] : ['GITHUB_TOKEN', 'E2E_CLAUDE_API_KEY']; + + for (const envVar of requiredVars) { + if (!process.env[envVar]) { + throw new Error(`Missing required environment variable: ${envVar}`); + } + } + + const dryRun = process.argv.includes('--dry-run'); + + return { + githubToken: process.env.GITHUB_TOKEN!, + claudeApiKey: process.env.E2E_CLAUDE_API_KEY ?? 'mock', + slackBotToken: process.env.SLACK_BOT_TOKEN ?? '', + slackChannelId: process.env.SLACK_CHANNEL_ID ?? '', + slackThreadTs: process.env.SLACK_THREAD_TS ?? '', + targetOwner: process.env.TARGET_OWNER ?? 'MetaMask', + targetRepo: process.env.TARGET_REPO ?? 'metamask-extension', + failuresJson: process.env.FAILURES_JSON ?? '', + dryRun, + }; +} + +function parseFailures(failuresJson: string): FlakyTestFailure[] { + if (!failuresJson) { + throw new Error( + 'FAILURES_JSON is empty. Pass it as an env var or use --fixtures-file .', + ); + } + return JSON.parse(failuresJson) as FlakyTestFailure[]; +} + +function loadFailuresFromArgs(): string { + const fixturesIdx = process.argv.indexOf('--fixtures-file'); + if (fixturesIdx !== -1 && process.argv[fixturesIdx + 1]) { + const filePath = process.argv[fixturesIdx + 1]!; + console.log(`Loading failures from file: ${filePath}\n`); + return fs.readFileSync(filePath, 'utf-8'); + } + return process.env.FAILURES_JSON ?? 
''; +} + +async function mockAnalysis( + failure: FlakyTestFailure, + logSection: string, + toolContext: ToolContext, +): Promise<AnalysisResult> { + console.log(' [mock-llm] Simulating tool calls that Claude would make...'); + + // 1. Try fetching the test file + console.log(` [tool] fetch_file({"path":"${failure.path}"})`); + const testContent = await executeToolCall('fetch_file', { path: failure.path }, toolContext); + let resolvedPath = failure.path; + let fetched: string; + + if (testContent.startsWith('File not found')) { + console.log(' [tool] fetch_file => NOT FOUND'); + + // 2. Search for the correct file path + const testNameKeyword = failure.name.split(' ')[0]!.toLowerCase(); + console.log(` [tool] search_test_file({"query":"${testNameKeyword}"})`); + const searchResult = await executeToolCall('search_test_file', { query: testNameKeyword }, toolContext); + console.log(` [tool] search_test_file => ${searchResult.split('\n').length - 1} results`); + + const firstMatch = searchResult.match(/^- (.+\.spec\.\w+)$/m); + if (firstMatch?.[1]) { + resolvedPath = firstMatch[1]; + console.log(` [tool] fetch_file({"path":"${resolvedPath}"})`); + const retryContent = await executeToolCall('fetch_file', { path: resolvedPath }, toolContext); + fetched = retryContent.startsWith('File not found') ? 'NOT FOUND' : `${retryContent.length} chars`; + console.log(` [tool] fetch_file => ${fetched}`); + } else { + fetched = 'NOT FOUND (search also returned no .spec files)'; + } + } else { + fetched = `${testContent.length} chars`; + console.log(` [tool] fetch_file => ${fetched}`); + } + + // 3. Try fetching job logs via runId + if (failure.runId) { + console.log(` [tool] fetch_job_logs({"run_id":${failure.runId}})`); + const jobsResult = await executeToolCall('fetch_job_logs', { run_id: failure.runId }, toolContext); + const jobCount = (jobsResult.match(/^- Job /gm) ?? []).length; + console.log(` [tool] fetch_job_logs => ${jobCount} e2e jobs found`); + } + + // 4. 
Knowledge base lookup + console.log(' [tool] list_flakiness_categories({})'); + const categories = listKnowledgeSections(); + console.log(` [tool] list_flakiness_categories => ${categories.length} sections`); + + const errorLower = failure.lastError.toLowerCase(); + let matchedCategory = 'other'; + if (errorLower.includes('stale')) matchedCategory = 'stale_reference'; + else if (errorLower.includes('timeout')) matchedCategory = 'timing'; + else if (errorLower.includes('click intercepted')) matchedCategory = 'element_state'; + else if (errorLower.includes('no such window')) matchedCategory = 'window_race'; + + const knowledgeQuery = matchedCategory === 'stale_reference' ? 'React Re-renders' + : matchedCategory === 'timing' ? 'Actions that Take Time' + : matchedCategory === 'window_race' ? 'Race Conditions with Windows' + : 'Anti-Patterns'; + + console.log(` [tool] get_flakiness_patterns({"category":"${knowledgeQuery}"})`); + const section = getKnowledgeSection(knowledgeQuery); + console.log(` [tool] get_flakiness_patterns => ${section.length} chars`); + + // 5. Search for similar fixes + const fixKeyword = failure.name.split(' ').slice(0, 3).join(' '); + console.log(` [tool] search_similar_fixes({"query":"${fixKeyword}"})`); + const fixes = await executeToolCall( + 'search_similar_fixes', + { query: fixKeyword }, + toolContext, + ); + console.log(` [tool] search_similar_fixes => ${fixes.length} chars`); + + console.log(' [tool] submit_analysis({...})'); + + return { + testName: failure.name, + testPath: resolvedPath, + classification: 'flaky_test', + confidence: 75, + rootCauseCategory: matchedCategory, + rootCauseExplanation: `[MOCK] Based on error "${failure.lastError.substring(0, 80)}...", this appears to be a ${matchedCategory} issue. The test file was ${fetched}. 
Found ${categories.length} knowledge base sections and ${fixes.length} chars of similar fix data.`, + specificLines: ['[MOCK] Line analysis requires real Claude API'], + suggestedFix: '[MOCK] Fix suggestion requires real Claude API. Run without --mock-llm to get actual analysis.', + additionalNotes: `[MOCK] CI log section: ${logSection.substring(0, 100)}...`, + }; +} + +async function main(): Promise<void> { + const config = loadConfig(); + const octokit = new Octokit({ auth: config.githubToken }); + const mockLlm = process.argv.includes('--mock-llm'); + + console.log('=== Flaky Test AI Analyzer (Tool-Augmented Agent) ===\n'); + + if (mockLlm) { + console.log('MOCK LLM MODE: Using mock Claude responses. Tools will execute for real.\n'); + } + if (config.dryRun) { + console.log('DRY RUN MODE: Results will be printed to stdout, not posted to Slack.\n'); + } + + const failuresSource = loadFailuresFromArgs() || config.failuresJson; + const failures = parseFailures(failuresSource); + console.log(`Analyzing ${failures.length} test failures...\n`); + + const toolContext: ToolContext = { + octokit, + owner: config.targetOwner, + repo: config.targetRepo, + }; + + const findings: SlackFinding[] = []; + + for (let i = 0; i < failures.length; i++) { + const failure = failures[i]!; + console.log( + `[${i + 1}/${failures.length}] Analyzing: ${failure.name}`, + ); + + try { + console.log(' Fetching job log...'); + const logSection = await fetchJobLog( + octokit, + failure, + config.targetOwner, + config.targetRepo, + ); + + let analysis: AnalysisResult; + if (mockLlm) { + analysis = await mockAnalysis(failure, logSection, toolContext); + } else { + console.log(' Starting agentic analysis with Claude...'); + const prompt = buildInitialPrompt(failure, logSection, config.targetOwner, config.targetRepo); + analysis = await analyzeWithClaude( + prompt, + failure, + config.claudeApiKey, + toolContext, + ); + } + + const jobUrl = failure.jobId && failure.runId + ? 
`https://github.com/${config.targetOwner}/${config.targetRepo}/actions/runs/${failure.runId}/job/${failure.jobId}` + : ''; + const fileUrl = `https://github.com/${config.targetOwner}/${config.targetRepo}/blob/main/${failure.path}`; + + findings.push({ failure, analysis, jobUrl, fileUrl }); + + console.log(` Result: ${analysis.classification} (${analysis.confidence}% confidence)`); + console.log(` Root cause: ${analysis.rootCauseCategory}\n`); + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + console.error(` Failed to analyze: ${message}\n`); + } + } + + console.log(`\n=== Analysis complete: ${findings.length}/${failures.length} tests analyzed ===\n`); + + if (config.dryRun) { + for (const finding of findings) { + console.log('---'); + console.log(`Test: ${finding.failure.name}`); + console.log(`File: ${finding.failure.path}`); + console.log(`Classification: ${finding.analysis.classification}`); + console.log(`Confidence: ${finding.analysis.confidence}%`); + console.log(`Root Cause: ${finding.analysis.rootCauseCategory}`); + console.log(`Explanation: ${finding.analysis.rootCauseExplanation}`); + if (finding.analysis.specificLines.length > 0) { + console.log(`Problematic Lines:\n ${finding.analysis.specificLines.join('\n ')}`); + } + console.log(`Suggested Fix: ${finding.analysis.suggestedFix}`); + if (finding.analysis.additionalNotes) { + console.log(`Notes: ${finding.analysis.additionalNotes}`); + } + console.log(`Job: ${finding.jobUrl}`); + console.log(`File: ${finding.fileUrl}`); + console.log(''); + } + return; + } + + if (!config.slackBotToken || !config.slackChannelId || !config.slackThreadTs) { + console.log( + 'Slack credentials or thread_ts not provided. 
Skipping Slack posting.', + ); + console.log('Set SLACK_BOT_TOKEN, SLACK_CHANNEL_ID, and SLACK_THREAD_TS to enable.'); + return; + } + + console.log('Posting findings to Slack thread...'); + await postSlackFindings( + findings, + config.slackThreadTs, + config.slackBotToken, + config.slackChannelId, + ); + console.log('Done!'); +} + +main().catch((error: unknown) => { + console.error('\nFatal error:', error); + process.exit(1); +}); diff --git a/.github/scripts/analyze-flaky-tests/knowledge/extension-flakiness-patterns.md b/.github/scripts/analyze-flaky-tests/knowledge/extension-flakiness-patterns.md new file mode 100644 index 00000000..cb9a9ed4 --- /dev/null +++ b/.github/scripts/analyze-flaky-tests/knowledge/extension-flakiness-patterns.md @@ -0,0 +1,502 @@ +# Extension CI Flakiness + +> **Source:** [Extension CI Flakiness - Google Doc](https://docs.google.com/document/d/1oXd5d1X7j14lHLjaRCWjEh3uhndrXQ_46lBuZ9SAu6M/edit?tab=t.0) + +--- + +## Table of Contents + +- [E2E Flakiness Categories](#e2e-flakiness-categories) + - [Race Conditions on Driver/Helpers Functions](#race-conditions-on-driverhelpers-functions) + - [Taking Unnecessary Steps](#taking-unnecessary-steps) + - [Missing or Incorrect Use of Mocks](#missing-or-incorrect-use-of-mocks) + - [Removing URL/host entries to the live server allowlist](#removing-urlhost-entries-to-the-live-server-allowlist) + - [Race Conditions on Gas / Balance / Navigation values on Screen](#race-conditions-on-gas--balance--navigation-values-on-screen) + - [Confirmation Popups / Modals](#confirmation-popups--modals) + - [Incorrect Testing Conditions](#incorrect-testing-conditions) + - [Race Conditions with Assertions within the Test Body Steps](#race-conditions-with-assertions-within-the-test-body-steps) + - [Race Conditions with Windows](#race-conditions-with-windows) + - [Race Conditions with React Re-renders](#race-conditions-with-react-re-renders) + - [Actions that Take Time](#actions-that-take-time) + - [Errors in the 
testing dapp](#errors-in-the-testing-dapp) + - [Not using driver methods](#not-using-driver-methods) +- [Bugs Discovered on the Wallet Level while Investigating Flaky Tests](#bugs-discovered-on-the-wallet-level-while-investigating-flaky-tests) +- [E2E Anti-Patterns](#e2e-anti-patterns) +- [Unit Test Flakiness Categories](#unit-test-flakiness-categories) +- [Flakiness on Other CI Jobs](#flakiness-on-other-ci-jobs) + +--- + +## E2E Flaky Tests Walkthrough + +- **First session** — [Recording](https://consensys.zoom.us/) (Passcode: `h2&gExZE`) +- **Second session** — [Recording](https://consensys.zoom.us/) (Passcode: `M+^1Tr9Y`) + +--- + +## E2E Flakiness Categories + +### Race Conditions on Driver/Helpers Functions + +- **Click Element with stale element** + [MetaMask/metamask-extension#24813](https://github.com/MetaMask/metamask-extension/pull/24813) + +- **Waiting for the correct window handle number** + [MetaMask/metamask-extension#24741](https://github.com/MetaMask/metamask-extension/pull/24741) + +- **Get window title undefined** + [MetaMask/metamask-extension#24642](https://github.com/MetaMask/metamask-extension/issues/24642) + +- **Click parent Element with inner elements that refresh instead of the most specific element possible** + [MetaMask/metamask-extension#24652](https://github.com/MetaMask/metamask-extension/issues/24652) + +- **Holding SRP button for less time than required** + [MetaMask/metamask-extension#25328](https://github.com/MetaMask/metamask-extension/pull/25328) + +- **Trezor e2e: race condition getting multiple elements with the same selector and then expecting to have the exact number** + [Slack thread](https://consensys.slack.com/archives/C1L7H42BT/p1721043228543209?thread_ts=1720108424.349269&cid=C1L7H42BT) + Fix: [commit 130794d](https://github.com/MetaMask/metamask-extension/pull/25824/commits/130794d18e5ae39887d70e161636b3ec6f4f8164) + +--- + +### Taking Unnecessary Steps + +- **Create token, approve token, missing permission controller 
connected to the test dapp** + [MetaMask/metamask-extension#24937](https://github.com/MetaMask/metamask-extension/pull/24937) + +- **Migrate Opensea missing permission controller connected to the test dapp, smart contract deployed in the background** + [MetaMask/metamask-extension#24739](https://github.com/MetaMask/metamask-extension/pull/24739) + +- **Request Queue SwitchChain missing smart contract deployed in the background** + [MetaMask/metamask-extension#24674](https://github.com/MetaMask/metamask-extension/pull/24674) + +- **Unnecessary browser refresh, causing the test to land on the Confirmation screen if it was appearing in the activity as unapproved** + [MetaMask/metamask-extension#24809](https://github.com/MetaMask/metamask-extension/pull/24809) + +- **Unnecessary scrolls and delays, which added up to more than 15 seconds of delay** + [MetaMask/metamask-extension#25288](https://github.com/MetaMask/metamask-extension/issues/25288) + +- **Unnecessary step on enabling the nonce going to Settings, instead of using preferenceController fixtures** + [commit 9a72e16](https://github.com/MetaMask/metamask-extension/pull/25687/commits/9a72e166e668fb8a0eba642365b4d62924545ec0) + +- **Unnecessary step of deploying a contract when it's already deployed and loaded in the test dapp param** + [commit 9a72e16](https://github.com/MetaMask/metamask-extension/pull/25687/commits/9a72e166e668fb8a0eba642365b4d62924545ec0) + +- **Unnecessary steps importing a token instead of using fixtures (had to modify the chainId as the token was imported to chainId 1 using ganache)** + [MetaMask/metamask-extension#26654](https://github.com/MetaMask/metamask-extension/pull/26654) + +- **Switching to Mainnet before starting a test for Import tokens — can use fixtures to start the wallet in Mainnet network** + [MetaMask/metamask-extension#27567](https://github.com/MetaMask/metamask-extension/pull/27567) + +- **Unnecessary steps by deploying manually 3 token contracts instead of just pre deploying using 
the anvil seeder** + [MetaMask/metamask-extension#35664](https://github.com/MetaMask/metamask-extension/pull/35664) + +- **Unnecessary steps for switching network when already in the network I want** + [MetaMask/metamask-extension#37374](https://github.com/MetaMask/metamask-extension/pull/37374) + +--- + +### Missing or Incorrect Use of Mocks + +- **Missing IPFS metadata mock for Import ERC1155** + [MetaMask/metamask-extension#24709](https://github.com/MetaMask/metamask-extension/pull/24709) + +- **Missing mocks for ENS resolution** + [MetaMask/metamask-extension#24898](https://github.com/MetaMask/metamask-extension/pull/24898) + +- **Missing aggregatorMetadata, block list and include blocked tokens mocks** + [MetaMask-planning#2637](https://github.com/MetaMask/MetaMask-planning/issues/2637) + +- **Missing mock for quotes Swap test** + [MetaMask/metamask-extension#27160](https://github.com/MetaMask/metamask-extension/pull/27160) + +- **Inconsistency between the mocked value and the default value: test execution success depends on the polling rate** + [MetaMask/metamask-extension#23520](https://github.com/MetaMask/metamask-extension/pull/23520) + +- **Mocking eth_balance with a value >0ETH causes request polling for subsequent accounts, creating new ones and preventing other requests. 
Mock balance 0 to avoid this when using Mainnet** + [MetaMask/metamask-extension#25525](https://github.com/MetaMask/metamask-extension/pull/25525) + +- **Incorrect mock request by passing an id, makes the body never match, so the mock response is not implemented** + [MetaMask/metamask-extension#27156](https://github.com/MetaMask/metamask-extension/pull/27156) + +- **Solana missing mock to api.simplehash.com broke CI when that request changed its response, causing a subsequent call to another external API not in the privacy snapshot** + [MetaMask/metamask-extension#29986](https://github.com/MetaMask/metamask-extension/pull/29986) + +- **Add transaction simulation supported networks global mock** + [MetaMask/metamask-extension#30507](https://github.com/MetaMask/metamask-extension/pull/30507) + +- **Blockaid API was not correctly mocked (chainId used was int instead of hex), causing Blockaid validation to fail and metrics event assertion values to fail** + [MetaMask/metamask-extension#30769](https://github.com/MetaMask/metamask-extension/pull/30769) + +- **The default mock for Solana was over-riding the custom mock, causing the balance to be different if the test was slow enough** + [MetaMask/metamask-extension#30808](https://github.com/MetaMask/metamask-extension/pull/30808) + +- **Missing mock caused Smart Transactions + Swap specs to fail** + [MetaMask/metamask-extension#30932](https://github.com/MetaMask/metamask-extension/pull/30932) + +- **Missing mock for Swaps notifications slippage tests** + [MetaMask/metamask-extension#31383](https://github.com/MetaMask/metamask-extension/pull/31383) + +- **Missing mock for Solana devnet** + [MetaMask/metamask-extension#31331](https://github.com/MetaMask/metamask-extension/pull/31331) + +- **Missing mock on onboarding privacy** + [MetaMask/metamask-extension#31272](https://github.com/MetaMask/metamask-extension/pull/31272) + +- **Missing user storage mocks** + 
[MetaMask/metamask-extension#31947](https://github.com/MetaMask/metamask-extension/pull/31947) + +- **Missing mock for custom network during onboarding** + [MetaMask/metamask-extension#32932](https://github.com/MetaMask/metamask-extension/pull/32932) + +- **Missing the token list mock** + [MetaMask/metamask-extension#34834](https://github.com/MetaMask/metamask-extension/pull/34834) + +--- + +### Removing URL/host entries to the live server allowlist + +- **Part 1:** [MetaMask/metamask-extension#33267](https://github.com/MetaMask/metamask-extension/pull/33267) +- **Part 2:** [MetaMask/metamask-extension#33302](https://github.com/MetaMask/metamask-extension/pull/33302) + +--- + +### Race Conditions on Gas / Balance / Navigation values on Screen + +- **Balance not loaded when starting the Send, causing gas to be 0 and blocking the Confirmation screen** + [MetaMask/metamask-extension#24639](https://github.com/MetaMask/metamask-extension/pull/24639) + Same issue: [#34128](https://github.com/MetaMask/metamask-extension/pull/34128), [#34854](https://github.com/MetaMask/metamask-extension/pull/34854) + +- **Mismatch in gas calculation values, when changing the increase token allowance amount** + [MetaMask/metamask-extension#24734](https://github.com/MetaMask/metamask-extension/pull/24734) + +- **Active network data (isActive, EIP1559..) was not loaded in state when running the assertion** + [MetaMask/metamask-extension#25137](https://github.com/MetaMask/metamask-extension/pull/25137) + +- **Gas is not recalculated before clicking Continue, when switching assets in the Send flow** + [MetaMask/metamask-extension#25181](https://github.com/MetaMask/metamask-extension/issues/25181) + +- **Transaction didn't have the total value loaded before we click reject** + [MetaMask/metamask-extension#25312](https://github.com/MetaMask/metamask-extension/pull/25312) + +- **Spec was not waiting for queued signatures to display navigation, making some signatures not queue properly. 
Need to wait for the navigation numbers to appear before queueing a new signature** + [MetaMask/metamask-extension#27481](https://github.com/MetaMask/metamask-extension/pull/27481) + +--- + +### Confirmation Popups / Modals + +- **Snaps confirmation popup appears in confirmation screen** + [MetaMask/metamask-extension#24939](https://github.com/MetaMask/metamask-extension/pull/24939) + +- **Vault decryption confirmation popup appears in settings** + [MetaMask/metamask-extension#24830](https://github.com/MetaMask/metamask-extension/pull/24830) + +- **"Got it" element taking time to disappear obfuscates other elements** + [MetaMask/metamask-extension#24580](https://github.com/MetaMask/metamask-extension/pull/24580) + +- **Add account popup obfuscates clicking on the next element from the Home page** + [MetaMask/metamask-extension#25861](https://github.com/MetaMask/metamask-extension/pull/25861) + +- **Import NFT modal obfuscates clicking on the Account menu** + [MetaMask/metamask-extension#27006](https://github.com/MetaMask/metamask-extension/pull/27006) + +- **On the onboarding carousel, not waiting for the element to disappear when switching between screens causes race conditions** + [MetaMask/metamask-extension#27858](https://github.com/MetaMask/metamask-extension/pull/27858) + +- **On the Add token flow, should wait until the dialog has been closed before proceeding — otherwise re-render with React failures** + [MetaMask/metamask-extension#27853](https://github.com/MetaMask/metamask-extension/pull/27853) + +- **On Queued Confirmations tests, connected manually to the test dapp and didn't wait for the MM dialog to close after connect. 
Caused chainId to be incorrectly outdated** + [MetaMask/metamask-extension#30028](https://github.com/MetaMask/metamask-extension/pull/30028) + +- **The notification (red dot) appears on top of the menu, blocking clicks on the menu button** + [MetaMask/metamask-extension#33492](https://github.com/MetaMask/metamask-extension/pull/33492) + +- **When changing language, sometimes the dropdown menu remains open, causing the next click to have no effect** + [MetaMask/metamask-extension#34169](https://github.com/MetaMask/metamask-extension/pull/34169) + +--- + +### Incorrect Testing Conditions + +- **Testing background in MV3 builds, where there is no background but service worker instead** + [MetaMask/metamask-extension#25164](https://github.com/MetaMask/metamask-extension/pull/25164) + +--- + +### Race Conditions with Assertions within the Test Body Steps + +- **Assert element value as soon as we find the element — the real value has not been rendered** + [MetaMask/metamask-extension#23450](https://github.com/MetaMask/metamask-extension/pull/23450) + +- **Rapid input of the entire Chain ID resulted in the error message appearing and persisting** + [MetaMask/metamask-extension#24790](https://github.com/MetaMask/metamask-extension/pull/24790) + +- **Trying to find a pending transaction and then a confirmed one — bad pattern as we shouldn't look for transient elements. Looking for the confirmed tx gives us the assertion we want** + [MetaMask/metamask-extension#25545](https://github.com/MetaMask/metamask-extension/pull/25545) + +- **Assert the currentUrl is the desired one can create a race condition. The correct approach is to wait for the URL we want** + [MetaMask/metamask-extension#26651](https://github.com/MetaMask/metamask-extension/pull/26651) + +- **Find an element and then assert it has the correct status (enabled) creates a race condition. 
Need to wait for the desired state instead of asserting directly** + [MetaMask/metamask-extension#27017](https://github.com/MetaMask/metamask-extension/pull/27017) + +- **Find an element and then assert it has the correct value creates a race condition. Need to wait for the desired value** + [MetaMask/metamask-extension#27095](https://github.com/MetaMask/metamask-extension/pull/27095) + +- **Find an element and assert it has the correct text for dapp permissions** + [MetaMask/metamask-extension#27894](https://github.com/MetaMask/metamask-extension/pull/27894) + +- **Looking for the Deposit transaction by its text in the activity tab — this element updates its state from pending to confirmed, meaning it can become stale** + [MetaMask/metamask-extension#27889](https://github.com/MetaMask/metamask-extension/pull/27889) + +- **Asserting an element is displayed after looking for its selector can cause race conditions where the element is updated in between (e.g., tx from pending to confirmed)** + [MetaMask/metamask-extension#27928](https://github.com/MetaMask/metamask-extension/pull/27928/files) + +- **Find element and assert correct text in the Swaps STX spec** + [MetaMask/metamask-extension#32032](https://github.com/MetaMask/metamask-extension/pull/32032) + +- **Find element and assert correct text in wallet_invokeMethod multichain test** + [MetaMask/metamask-extension#32962](https://github.com/MetaMask/metamask-extension/pull/32962) + +--- + +### Race Conditions with Windows + +- **Vault decrypt uses a production build which automatically opens a MetaMask window. 
Using driver.navigate too caused 2 MetaMask windows, leading to flakiness as the active browser window was not where driver actions were happening** + [MetaMask/metamask-extension#25443](https://github.com/MetaMask/metamask-extension/pull/25443) + +- **Getting all windows and after several steps referencing an old window** + [MetaMask/metamask-extension#2585](https://github.com/MetaMask/metamask-extension/pull/2585) + +- **Tests that click a button in the popup window that eventually closes it, but don't wait for the popup to close before continuing. Added a method that clicks and waits for the window to close** + [MetaMask/metamask-extension#26449](https://github.com/MetaMask/metamask-extension/pull/26449), + [MetaMask/metamask-extension#26725](https://github.com/MetaMask/metamask-extension/pull/26725) + +- **Triggering a Send from Dapp 1 and quickly switching to Dapp 0 — the network for the first Send is taken from Dapp 0 instead of Dapp 1** + [MetaMask/metamask-extension#26794](https://github.com/MetaMask/metamask-extension/pull/26794) + +- **chainId proxy sync should preserve per dapp network selections** + [MetaMask/metamask-extension#31599](https://github.com/MetaMask/metamask-extension/pull/31599) + +- **Multichain API Call wallet_createSession** + [MetaMask/metamask-extension#31603](https://github.com/MetaMask/metamask-extension/pull/31603) + +- **Snaps race condition with windows** + [MetaMask/metamask-extension#32320](https://github.com/MetaMask/metamask-extension/pull/32320) + +- **Snap cronjobs dialog appears and disappears after some seconds — needed specific assert handling for the case where the window was closed automatically** + [MetaMask/metamask-extension#33004](https://github.com/MetaMask/metamask-extension/pull/33004) + +- **Need to wait until the dialog is closed before performing the next action in Request Queuing tests** + [MetaMask/metamask-extension#34141](https://github.com/MetaMask/metamask-extension/pull/34141) + +--- + +### Race 
Conditions with React Re-renders + +- **After changing the language, clicking on the account menu while MetaMask is in a loading state — the click has no effect as the component re-renders** + [MetaMask/metamask-extension#25648](https://github.com/MetaMask/metamask-extension/pull/25648) + +- **Checkbox component for Snap Insights Signatures is re-rendered when the host value is loaded, making the checkbox unchecked if the click happens before the re-render** + [MetaMask/metamask-extension#27007](https://github.com/MetaMask/metamask-extension/pull/27007) + +- **The Add account modal needs to finish rendering the account list before proceeding with a click action — otherwise the re-render causes the click to be performed outside the popup, closing the modal** + [MetaMask/metamask-extension#27420](https://github.com/MetaMask/metamask-extension/pull/27420) + +- **In the onboarding flow, clicking an element while it's moving causes the click to have no effect. Added a new driver method to wait until the element is not moving** + [MetaMask/metamask-extension#27921](https://github.com/MetaMask/metamask-extension/pull/27921) + +- **In the carousel spec, looking for an element and then using `.click` — a re-render in between made the element stale. 
Should use the custom `clickElement` driver method** + [MetaMask/metamask-extension#33362](https://github.com/MetaMask/metamask-extension/pull/33362) + +--- + +### Actions that Take Time + +- **Requests to Sentry take time — if the wait time is not enough, tests will be flaky** + [MetaMask/metamask-extension#26648](https://github.com/MetaMask/metamask-extension/pull/26648) + +- **Chrome takes time to write to .log files (storage) — vault decrypt test was flaky when trying to import the log file before it was finished writing** + [MetaMask/metamask-extension#26612](https://github.com/MetaMask/metamask-extension/pull/26612) + +- **The Connect action takes several seconds — the default timeout for the next action was not enough** + [MetaMask/metamask-extension#26792](https://github.com/MetaMask/metamask-extension/pull/26792) + +- **After going to metamask.io with Marketing feature enabled, the cookie id takes time to be added into MetaMask state** + [MetaMask/metamask-extension#26697](https://github.com/MetaMask/metamask-extension/pull/26697/files#diff-b1c4086e548781d946ed142c838710286d74e7043c5b7b0edce4e5f617091a52R73) + +- **Some `it` blocks are really long leading to timeout issues — not because the test fails, but because the 80000ms threshold is reached** + [MetaMask/metamask-extension#30044](https://github.com/MetaMask/metamask-extension/pull/30044) + +- **Metrics events can get unordered if 2 actions are done subsequently very fast, leading to the 2nd event being the first one triggered** + [MetaMask/metamask-extension#30031](https://github.com/MetaMask/metamask-extension/pull/30031) + +- **Importing a function from another spec file causes the tests from that spec file to also be run, causing long test runs and possible timeouts** + [MetaMask/metamask-extension#30481](https://github.com/MetaMask/metamask-extension/pull/30481) + +- **Chain id is not immediately set when we land on the home page. 
For actions that rely on chain id, should wait until the balance is loaded** + [MetaMask/metamask-extension#31348](https://github.com/MetaMask/metamask-extension/pull/31348) + +- **Creating an account takes a few seconds to be loaded. Performing a subsequent action right away without checking can create race conditions (e.g., switching to Solana shows a dialog warning about missing Solana account)** + [MetaMask/metamask-extension#31382](https://github.com/MetaMask/metamask-extension/pull/31382) + +- **On the Swap page with a default token, adding an amount triggers quotes. Changing to a custom token before quotes finalize can load quotes for the previous token swap** + [MetaMask/metamask-extension#32233](https://github.com/MetaMask/metamask-extension/pull/32233) + +- **Re-starting the wallet after the vault was corrupt** + [MetaMask/metamask-extension#33591](https://github.com/MetaMask/metamask-extension/pull/33591) + +- **Scroll to bottom using the arrow button takes several seconds for the button to disappear (wallet-side bug)** + [MetaMask/metamask-extension#33493](https://github.com/MetaMask/metamask-extension/pull/33493) + +- **Writing to the local storage file takes time — Vault Decryptor test flaky because sometimes the backup file was empty on upload** + [MetaMask/metamask-extension#33646](https://github.com/MetaMask/metamask-extension/pull/33646) + +- **Request to Profile Sync after onboarding takes seconds — locking the wallet before this request causes "unable to proceed, wallet is locked" error** + [MetaMask/metamask-extension#33763](https://github.com/MetaMask/metamask-extension/pull/33763) + +- **After login, Authentication API requests take time to be triggered. 
Locking the wallet before they happen causes "wallet is locked" error** + [MetaMask/metamask-extension#34888](https://github.com/MetaMask/metamask-extension/pull/34888) + +- **Triggering several transactions from different dapps without waiting individually can cause transactions to appear in a different order** + [MetaMask/metamask-extension#35944](https://github.com/MetaMask/metamask-extension/pull/35944) + +--- + +### Errors in the testing dapp + +- **A span element is nested inside the buttons for all Snap test e2e buttons — causes flakiness when interacting with the button. Fixed on the snap test dapp side** + [MetaMask/snaps#2782](https://github.com/MetaMask/snaps/pull/2782) + Related: [#27597](https://github.com/MetaMask/metamask-extension/issues/27597), [#27576](https://github.com/MetaMask/metamask-extension/issues/27576), [#26804](https://github.com/MetaMask/metamask-extension/issues/26804) + +- **Phishing detection page adds event listener later on, making the click to the malicious link do nothing** + [MetaMask/phishing-warning#173](https://github.com/MetaMask/phishing-warning/pull/173) + +--- + +### Not using driver methods + +- **Using `element.click()` instead of `clickElement()` can cause race conditions when the element is present but not clickable. 
The driver function has appropriate guards in place** + [MetaMask/metamask-extension#27599](https://github.com/MetaMask/metamask-extension/pull/27599) + +--- + +## Bugs Discovered on the Wallet Level while Investigating Flaky Tests + +- **Send - ENS resolution displays different address length previews** + [MetaMask/metamask-extension#25286](https://github.com/MetaMask/metamask-extension/issues/25286) + +- **Tokens - MM breaks with "Can't convert undefined to object"** + [MetaMask/metamask-extension#25266](https://github.com/MetaMask/metamask-extension/issues/25266) + +- **Gas - Race condition where gas is not updated after switching assets and going to the last Confirmation screen** + [MetaMask/metamask-extension#25243](https://github.com/MetaMask/metamask-extension/issues/25243) + +- **Assets - Importing an ERC1155 token throws "Contract does not support ERC721 metadata interface"** + [MetaMask/metamask-extension#24988](https://github.com/MetaMask/metamask-extension/issues/24988) + +- **Tokens - Cannot import a token ERC1155 if the IPFS call for the metadata takes long** + [MetaMask/metamask-extension#24710](https://github.com/MetaMask/metamask-extension/issues/24710) + +- **Onboarding rerouting when createNewAccount flow** + [MetaMask/metamask-extension#24874](https://github.com/MetaMask/metamask-extension/pull/24874) + +- **Announcements - NFT autodetection modal overlays Token autodetection modal** + [MetaMask/metamask-extension#25465](https://github.com/MetaMask/metamask-extension/issues/25465) + +- **Settings - Changing the app locale re-renders the state two times and displays the loading spinner 2 times** + [MetaMask/metamask-extension#25651](https://github.com/MetaMask/metamask-extension/issues/25651) + +- **Hardware Wallet - Going to the hardware wallet add account page in Firefox re-renders the state two times** + [MetaMask/metamask-extension#25851](https://github.com/MetaMask/metamask-extension/issues/25851) + +- **Race condition changes order in which 
transactions are displayed** + [MetaMask/metamask-extension#25251](https://github.com/MetaMask/metamask-extension/issues/25251) + +- **Assets - Add token doesn't close the MM dialog after Adding it (MMI-only)** + [MetaMask/metamask-extension#27854](https://github.com/MetaMask/metamask-extension/issues/27854) + +- **Wallet API - When connecting to the test dapp for the first time, switched to Mainnet automatically despite not having this network selected (Release Blocker)** + [MetaMask/metamask-extension#27891](https://github.com/MetaMask/metamask-extension/issues/27891) + +- **Network Switch - After switching networks for the first time, "Network switched" dialog sometimes appears and sometimes doesn't** + [MetaMask/metamask-extension#27870](https://github.com/MetaMask/metamask-extension/issues/27870) + +- **BTC Accounts - Portfolio link is not displayed when we have a BTC Account selected** + [MetaMask/metamask-extension#28185](https://github.com/MetaMask/metamask-extension/issues/28185) + +- **Blockaid security validation can be bypassed with race condition** + [Slack thread](https://consensys.slack.com/archives/C029JG63136/p1731690020573439?thread_ts=1729246801.516029&cid=C029JG63136) + +- **Wallet API queuing bug not fully fixed** + [Slack thread](https://consensys.slack.com/archives/CTQAGKY5V/p1731693702380099?thread_ts=1731579667.780579&cid=CTQAGKY5V) + +- **Balance polling starting with a locked wallet makes balance load forever when unlocked (until MM is refreshed)** + [commit 9aff235](https://github.com/MetaMask/metamask-extension/pull/28277/commits/9aff235d168598ac0c4da763a6eef0b7c7002212) + +- **Gas controls to edit Max base fee and Priority Fee do not support decimal point on Mac (test passed on Linux but not Mac)** + [MetaMask/metamask-extension#28843](https://github.com/MetaMask/metamask-extension/issues/28843) + +- **Send - When pasting an address without 0x prefix, the input is normalized but the Continue button remains disabled** + 
[MetaMask/metamask-extension#30349](https://github.com/MetaMask/metamask-extension/issues/30349) + +- **Gas API - Starting a transaction/swap makes a request to /networks/1/gasPrices even if not on Ethereum Mainnet** + [MetaMask/metamask-extension#33377](https://github.com/MetaMask/metamask-extension/issues/33377) + +- **Survey - 2 identical requests are made to the surveys endpoint whenever we start the wallet for the 1st time** + [MetaMask/metamask-extension#33604](https://github.com/MetaMask/metamask-extension/issues/33604) + +- **Error is re-thrown causing duplicated Error key (e.g., "Error: Error: Unable to find value of key...")** + [MetaMask/metamask-extension#34867](https://github.com/MetaMask/metamask-extension/issues/34867) + +- **Accounts - Repeated API GET request to profile/lineage after login** + [MetaMask/metamask-extension#34938](https://github.com/MetaMask/metamask-extension/issues/34938) + +- **ENS resolution - IPFS resolves domains before I've onboarded to the wallet** + [MetaMask/metamask-extension#35675](https://github.com/MetaMask/metamask-extension/issues/35675) + +- **Onboarding - Metametrics page sometimes appears on Chrome browser for Social login** + [MetaMask/metamask-extension#36070](https://github.com/MetaMask/metamask-extension/issues/36070) + +- **Accounts state not updated immediately after create-password** + [MetaMask/metamask-extension#36395](https://github.com/MetaMask/metamask-extension/pull/36395) + +--- + +## E2E Anti-Patterns + +- **Directly asserting element values by text without waiting for that text to be present** using `assert(element.getText(), expected text)` + [MetaMask/metamask-extension#19870](https://github.com/MetaMask/metamask-extension/issues/19870) + +- **Looking for an element and then asserting it's displayed** with `assert.equal(await elem.isDisplayed(), true)` — causes race conditions if the element updates between the lookup and assertion (e.g., transaction changes from pending to confirmed, throwing 
"stale element" error) + [MetaMask/metamask-extension#27928](https://github.com/MetaMask/metamask-extension/pull/27928/files#r1805186006) + +- **Using `element.click()` instead of `clickElement()`** — looking for the element and then using `.click` can cause race conditions if the element re-renders and becomes stale. The `clickElement` driver method has a guard for this + [MetaMask/metamask-extension#27599](https://github.com/MetaMask/metamask-extension/pull/27599) + +- **Going to live sites** (portfolio dapp, snap dapp, vault decrypt page) instead of using mocks + > Note: [a catch-all mock PR](https://github.com/MetaMask/metamask-extension/) exists, but currently 130+ specs fail because they rely on live requests. Once fixed and merged, it won't be possible to introduce changes without adding corresponding mocks. + +- **Adding delays instead of waiting for conditions** — prefer waiting for an explicit condition whenever possible + +- **Importing a function from another spec file** — this causes the tests from the imported spec file to also be run, causing long test runs and possible timeouts (>80000ms) + [MetaMask/metamask-extension#30481](https://github.com/MetaMask/metamask-extension/pull/30481/files#r1965313492) + +--- + +## Unit Test Flakiness Categories + +- **A property of the store is sometimes undefined** + [MetaMask/metamask-extension#27941](https://github.com/MetaMask/metamask-extension/pull/27941) + +--- + +## Flakiness on Other CI Jobs + +- **The lint-lockfile job is flaky as it's under-resourced** — fixed by changing resources from medium to medium-plus + [MetaMask/metamask-extension#27950](https://github.com/MetaMask/metamask-extension/pull/27950) + +- **Rate limited by yarnpkg returning 429 Too Many Requests** — makes any job dependent on yarn fail + [Slack thread](https://consensys.slack.com/archives/CTQAGKY5V/p1747406828996759) diff --git a/.github/scripts/analyze-flaky-tests/llm/claude-analyzer.ts b/.github/scripts/analyze-flaky-tests/llm/claude-analyzer.ts new file mode 100644 index 
00000000..6573c3ff --- /dev/null +++ b/.github/scripts/analyze-flaky-tests/llm/claude-analyzer.ts @@ -0,0 +1,136 @@ +import Anthropic from '@anthropic-ai/sdk'; +import type { AnalysisResult, FlakyTestFailure } from '../types'; +import { getToolDefinitions, executeToolCall } from './tools'; +import type { ToolContext } from './tools'; + +const MODEL = 'claude-sonnet-4-20250514'; +const MAX_TOKENS = 4096; +const MAX_ITERATIONS = 10; + +function extractAnalysisFromToolCall( + content: Anthropic.Messages.ContentBlock[], + failure: FlakyTestFailure, +): AnalysisResult | null { + for (const block of content) { + if (block.type === 'tool_use' && block.name === 'submit_analysis') { + const input = block.input as Record; + return { + testName: failure.name, + testPath: failure.path, + classification: (input.classification as AnalysisResult['classification']) ?? 'flaky_test', + confidence: typeof input.confidence === 'number' ? input.confidence : 50, + rootCauseCategory: (input.rootCauseCategory as string) ?? 'other', + rootCauseExplanation: (input.rootCauseExplanation as string) ?? 'Unable to determine root cause.', + specificLines: Array.isArray(input.specificLines) ? (input.specificLines as string[]) : [], + suggestedFix: (input.suggestedFix as string) ?? 'No suggestion available.', + additionalNotes: (input.additionalNotes as string) ?? 
'', + }; + } + } + return null; +} + +export async function analyzeWithClaude( + initialPrompt: string, + failure: FlakyTestFailure, + apiKey: string, + toolContext: ToolContext, +): Promise { + const client = new Anthropic({ apiKey }); + const tools = getToolDefinitions(toolContext.owner, toolContext.repo); + const messages: Anthropic.Messages.MessageParam[] = [ + { role: 'user', content: initialPrompt }, + ]; + + for (let iteration = 0; iteration < MAX_ITERATIONS; iteration++) { + const response = await client.messages.create({ + model: MODEL, + max_tokens: MAX_TOKENS, + tools, + messages, + }); + + const submittedAnalysis = extractAnalysisFromToolCall(response.content, failure); + if (submittedAnalysis) { + const submitBlock = response.content.find( + (b) => b.type === 'tool_use' && b.name === 'submit_analysis', + ); + if (submitBlock && submitBlock.type === 'tool_use') { + messages.push({ role: 'assistant', content: response.content }); + messages.push({ + role: 'user', + content: [ + { + type: 'tool_result', + tool_use_id: submitBlock.id, + content: 'Analysis received. Thank you.', + }, + ], + }); + } + return submittedAnalysis; + } + + if (response.stop_reason === 'end_turn') { + const textBlock = response.content.find((b) => b.type === 'text'); + if (textBlock && textBlock.type === 'text') { + try { + let cleaned = textBlock.text.trim(); + const jsonMatch = cleaned.match(/```(?:json)?\s*([\s\S]*?)```/); + if (jsonMatch?.[1]) cleaned = jsonMatch[1].trim(); + const parsed = JSON.parse(cleaned) as Record; + return { + testName: failure.name, + testPath: failure.path, + classification: (parsed.classification as AnalysisResult['classification']) ?? 'flaky_test', + confidence: typeof parsed.confidence === 'number' ? parsed.confidence : 50, + rootCauseCategory: (parsed.rootCauseCategory as string) ?? 'other', + rootCauseExplanation: (parsed.rootCauseExplanation as string) ?? 'Unable to determine root cause.', + specificLines: Array.isArray(parsed.specificLines) ? 
(parsed.specificLines as string[]) : [], + suggestedFix: (parsed.suggestedFix as string) ?? 'No suggestion available.', + additionalNotes: (parsed.additionalNotes as string) ?? '', + }; + } catch { + throw new Error(`Claude ended without calling submit_analysis. Raw response: ${textBlock.text.substring(0, 200)}`); + } + } + throw new Error('Claude ended conversation without producing an analysis.'); + } + + if (response.stop_reason !== 'tool_use') { + throw new Error(`Unexpected stop_reason: ${response.stop_reason}`); + } + + const toolUseBlocks = response.content.filter( + (block): block is Anthropic.Messages.ToolUseBlock => block.type === 'tool_use', + ); + + const toolResults: Anthropic.Messages.ToolResultBlockParam[] = []; + for (const toolUse of toolUseBlocks) { + if (toolUse.name === 'submit_analysis') continue; + + console.log(` [tool] ${toolUse.name}(${JSON.stringify(toolUse.input).substring(0, 100)})`); + + const result = await executeToolCall( + toolUse.name, + toolUse.input as Record, + toolContext, + ); + + const truncated = result.length > 15000 + ? `${result.substring(0, 15000)}\n... (truncated, ${result.length} chars total)` + : result; + + toolResults.push({ + type: 'tool_result', + tool_use_id: toolUse.id, + content: truncated, + }); + } + + messages.push({ role: 'assistant', content: response.content }); + messages.push({ role: 'user', content: toolResults }); + } + + throw new Error(`Analysis did not complete within ${MAX_ITERATIONS} iterations.`); +} diff --git a/.github/scripts/analyze-flaky-tests/llm/prompt-builder.ts b/.github/scripts/analyze-flaky-tests/llm/prompt-builder.ts new file mode 100644 index 00000000..b0eafd27 --- /dev/null +++ b/.github/scripts/analyze-flaky-tests/llm/prompt-builder.ts @@ -0,0 +1,50 @@ +import type { FlakyTestFailure } from '../types'; + +/** + * Builds the initial prompt for the agentic analysis loop. 
+ * Only includes the failure metadata and CI log -- the agent uses tools + * to fetch source code, knowledge base sections, and past fixes on demand. + */ +export function buildInitialPrompt( + failure: FlakyTestFailure, + logSection: string, + owner: string, + repo: string, +): string { + const classification = failure.isFlaky + ? 'Flaky (passed after retry)' + : 'Real failure'; + + const repoRef = `${owner}/${repo}`; + + return `You are an expert at diagnosing flaky E2E tests in the ${repoRef} repository. + +You have tools available to investigate this failure. Use them to: +1. Fetch the test source file and any page objects or helpers it imports (use search_test_file first if you're unsure of the exact path) +2. Look up relevant flakiness patterns from the knowledge base (list categories first, then fetch specific ones) +3. Search for similar past fixes if applicable +4. Fetch CI job logs if you need more context about the failure (use fetch_job_logs with run_id to discover jobs, then fetch specific job logs) + +## Failure Information +- Test name: ${failure.name} +- Test file: ${failure.path} +- Error message: ${failure.lastError} +- Times failed: ${failure.realFailures} real failures, ${failure.totalRetries} retries +- Classification: ${classification} +- Run ID: ${failure.runId ?? 'N/A'} +- Job ID: ${failure.jobId ?? 'N/A'} + +## Full Error + Stack Trace from CI Logs +\`\`\` +${logSection} +\`\`\` + +## Investigation Guidelines +- Most failures in this report are flaky tests, not app bugs +- If the test file path returns "not found", use search_test_file to discover the correct path +- Use list_flakiness_categories and get_flakiness_patterns to learn about known flakiness patterns and anti-patterns for this repository +- Fetch the test source, read its imports, and fetch relevant page objects or helpers +- Search for similar past fixes if applicable + +Start by fetching the test file at "${failure.path}". 
If it's not found, use search_test_file to find the correct path. Then investigate as needed. When done, call submit_analysis with your findings.`; +} diff --git a/.github/scripts/analyze-flaky-tests/llm/tools.ts b/.github/scripts/analyze-flaky-tests/llm/tools.ts new file mode 100644 index 00000000..0324fa1a --- /dev/null +++ b/.github/scripts/analyze-flaky-tests/llm/tools.ts @@ -0,0 +1,284 @@ +import type { Octokit } from '@octokit/rest'; +import type Anthropic from '@anthropic-ai/sdk'; +import { fetchFileContent, searchTestFiles } from '../utils/test-source-reader'; +import { searchFixesByKeyword } from '../utils/past-fixes-fetcher'; +import { fetchJobLogById, listE2eJobs } from '../utils/job-log-fetcher'; +import { + getKnowledgeSection, + listKnowledgeSections, +} from '../utils/knowledge-base'; + +export interface ToolContext { + octokit: Octokit; + owner: string; + repo: string; +} + +export function getToolDefinitions( + owner: string, + repo: string, +): Anthropic.Messages.Tool[] { + const repoRef = `${owner}/${repo}`; + + return [ + { + name: 'fetch_file', + description: + `Fetch the contents of a file from the ${repoRef} repository. ` + + 'Use this to read test files, page objects, helpers, fixtures, or any source code you need to investigate.', + input_schema: { + type: 'object' as const, + properties: { + path: { + type: 'string', + description: + 'File path relative to the repo root, e.g. "test/e2e/tests/connections/edit-account-permissions.spec.ts"', + }, + }, + required: ['path'], + }, + }, + { + name: 'search_test_file', + description: + `Search for test files in ${repoRef} by name or keyword. ` + + 'Returns matching file paths under test/e2e/. Use when you do not know the exact path or when fetch_file returns "not found".', + input_schema: { + type: 'object' as const, + properties: { + query: { + type: 'string', + description: + 'Keyword(s) to search for in file paths, e.g. 
"ens" or "refresh-auth" or "cronjob spec"', + }, + }, + required: ['query'], + }, + }, + { + name: 'fetch_job_logs', + description: + `Fetch GitHub Actions job logs from ${repoRef}. ` + + 'Provide either a job_id to fetch logs directly, or a run_id to list all e2e jobs in that workflow run. ' + + 'Optionally provide test_name to extract just the relevant failure section from the logs.', + input_schema: { + type: 'object' as const, + properties: { + job_id: { + type: 'number', + description: 'Specific GitHub Actions job ID to fetch logs for', + }, + run_id: { + type: 'number', + description: 'Workflow run ID -- lists all e2e test jobs so you can pick one to fetch logs from', + }, + test_name: { + type: 'string', + description: 'Test name to search for in logs (narrows the log output to the relevant failure)', + }, + }, + }, + }, + { + name: 'get_flakiness_patterns', + description: + 'Get a specific section from the flakiness knowledge base. ' + + 'Each section documents a category of flakiness with real examples and fix PRs. ' + + 'Use a keyword to match a section (e.g. "race conditions", "mocks", "popups", "windows", "re-renders", "assertions", "anti-patterns").', + input_schema: { + type: 'object' as const, + properties: { + category: { + type: 'string', + description: + 'Keyword to match a knowledge base section, e.g. "race conditions windows", "mocks", "popups modals", "assertions", "anti-patterns"', + }, + }, + required: ['category'], + }, + }, + { + name: 'list_flakiness_categories', + description: + 'List all available section headings in the flakiness knowledge base. ' + + 'Call this first to discover what categories are available before requesting a specific one.', + input_schema: { + type: 'object' as const, + properties: {}, + }, + }, + { + name: 'search_similar_fixes', + description: + `Search for merged pull requests in ${repoRef} that fixed similar flaky test issues. ` + + 'Returns PR titles and diffs filtered to test file changes. 
Use keywords from the error message or test pattern.', + input_schema: { + type: 'object' as const, + properties: { + query: { + type: 'string', + description: + 'Search keyword(s) to find similar past fixes, e.g. "stale element", "waitForSelector", "click intercepted", or the test file name', + }, + }, + required: ['query'], + }, + }, + { + name: 'submit_analysis', + description: + 'Submit the final analysis of the flaky test failure. Call this exactly once when you have completed your investigation.', + input_schema: { + type: 'object' as const, + properties: { + classification: { + type: 'string', + enum: ['flaky_test', 'app_bug', 'infra_issue'], + description: 'The type of failure', + }, + confidence: { + type: 'number', + description: 'Confidence level 0-100', + }, + rootCauseCategory: { + type: 'string', + enum: [ + 'timing', + 'element_state', + 'network_race', + 'stale_reference', + 'state_leakage', + 'animation', + 'missing_mock', + 'unnecessary_steps', + 'window_race', + 'react_rerender', + 'popup_modal', + 'other', + ], + description: 'The category of root cause', + }, + rootCauseExplanation: { + type: 'string', + description: '2-3 sentence explanation of what is causing the flakiness', + }, + specificLines: { + type: 'array', + items: { type: 'string' }, + description: 'Line numbers or code snippets causing the issue', + }, + suggestedFix: { + type: 'string', + description: 'Detailed description of the fix with before/after code', + }, + additionalNotes: { + type: 'string', + description: 'Any other observations', + }, + }, + required: [ + 'classification', + 'confidence', + 'rootCauseCategory', + 'rootCauseExplanation', + 'specificLines', + 'suggestedFix', + 'additionalNotes', + ], + }, + }, + ]; +} + +export async function executeToolCall( + toolName: string, + toolInput: Record, + context: ToolContext, +): Promise { + switch (toolName) { + case 'fetch_file': { + const filePath = toolInput.path as string; + const content = await fetchFileContent( + 
context.octokit, + context.owner, + context.repo, + filePath, + ); + return content ?? `File not found or could not be fetched: ${filePath}`; + } + + case 'search_test_file': { + const query = toolInput.query as string; + const matches = await searchTestFiles( + context.octokit, + context.owner, + context.repo, + query, + ); + if (matches.length === 0) { + return `No test files found matching "${query}". Try broader keywords.`; + } + return `Found ${matches.length} matching test file(s):\n${matches.map((p) => `- ${p}`).join('\n')}`; + } + + case 'fetch_job_logs': { + const jobId = toolInput.job_id as number | undefined; + const runId = toolInput.run_id as number | undefined; + const testName = toolInput.test_name as string | undefined; + + if (jobId) { + return fetchJobLogById(context.octokit, context.owner, context.repo, jobId, testName); + } + + if (runId) { + const jobs = await listE2eJobs(context.octokit, context.owner, context.repo, runId); + if (jobs.length === 0) { + return `No e2e test jobs found in run ${runId}.`; + } + return ( + `Found ${jobs.length} e2e job(s) in run ${runId}:\n` + + jobs.map((j) => `- Job ${j.id}: ${j.name} (conclusion: ${j.conclusion ?? 'running'})`).join('\n') + + '\n\nCall fetch_job_logs again with a specific job_id to get logs.' 
/** Number of log lines kept on each side of the anchor line. */
const CONTEXT_LINES = 100;

/** Patterns marking a line as an error message or a stack frame inside a spec/test file. */
const ERROR_PATTERNS = [
  /Error:/i,
  /AssertionError:/i,
  /TimeoutError:/i,
  /AssertionError/i,
  /at\s+.*\.(spec|test)\.(ts|js)/,
];

/**
 * Cuts a window of at most 2 * CONTEXT_LINES lines out of a job log, centred on
 * the most relevant line for `testName`.
 *
 * Anchor selection, in priority order:
 * 1. the first error line found within 50 lines after ANY mention of the test
 *    name (mentions are scanned top to bottom, so an early mention such as a
 *    queue listing no longer hides a failure reported further down);
 * 2. the first mention of the test name, when no mention has an error nearby;
 * 3. the last error line in the log, when the test name never appears;
 * 4. otherwise the last 200 lines of the log are returned.
 *
 * @param logText - Full raw job log.
 * @param testName - Test title searched for as a plain substring.
 * @returns The selected lines joined with '\n'.
 */
function extractRelevantLogSection(
  logText: string,
  testName: string,
): string {
  const errorLookaheadLines = 50;
  const fallbackTailLines = 200;
  const lines = logText.split('\n');

  const isErrorLine = (line: string | undefined): boolean =>
    ERROR_PATTERNS.some((pattern) => pattern.test(line ?? ''));

  const windowAround = (index: number): string => {
    const start = Math.max(0, index - CONTEXT_LINES);
    const end = Math.min(lines.length, index + CONTEXT_LINES);
    return lines.slice(start, end).join('\n');
  };

  let firstMention = -1;
  for (let i = 0; i < lines.length; i++) {
    if (!(lines[i] ?? '').includes(testName)) continue;
    if (firstMention === -1) firstMention = i;

    // Look for an error shortly after this mention of the test name.
    const limit = Math.min(i + errorLookaheadLines, lines.length);
    for (let j = i; j < limit; j++) {
      if (isErrorLine(lines[j])) return windowAround(j);
    }
  }

  // The test is mentioned but no error follows any mention: centre on the first mention.
  if (firstMention !== -1) return windowAround(firstMention);

  // Test name absent: fall back to the last error line anywhere in the log.
  for (let i = lines.length - 1; i >= 0; i--) {
    if (isErrorLine(lines[i])) return windowAround(i);
  }

  return lines.slice(-fallbackTailLines).join('\n');
}
/**
 * Lists the jobs of a workflow run whose name contains "e2e" (case-insensitive).
 *
 * Jobs are requested 100 per page and at most 5 pages are read, so at most the
 * first 500 jobs of the run are inspected. Paging stops early on the first
 * page that is not full.
 *
 * @param octokit - Authenticated Octokit client.
 * @param owner - Repository owner.
 * @param repo - Repository name.
 * @param runId - Workflow run whose jobs are listed.
 * @returns Id, name, conclusion and HTML URL of every matching job.
 */
export async function listE2eJobs(
  octokit: Octokit,
  owner: string,
  repo: string,
  runId: number,
): Promise<E2eJobInfo[]> {
  const perPage = 100;
  const maxPages = 5;
  const jobs: E2eJobInfo[] = [];

  for (let page = 1; page <= maxPages; page++) {
    const { data } = await octokit.rest.actions.listJobsForWorkflowRun({
      owner,
      repo,
      run_id: runId,
      per_page: perPage,
      page,
    });

    for (const job of data.jobs) {
      if (!job.name.toLowerCase().includes('e2e')) continue;
      jobs.push({
        id: job.id,
        name: job.name,
        conclusion: job.conclusion ?? null,
        htmlUrl: job.html_url ?? '',
      });
    }

    // A short page means there is nothing left to fetch.
    if (data.jobs.length < perPage) break;
  }

  return jobs;
}
/**
 * Collects the title text of every level-2 (`##`) and level-3 (`###`) heading
 * in the knowledge base, in document order.
 *
 * @returns Heading titles without the leading `#` markers; empty when the
 * knowledge base content is empty.
 */
export function listKnowledgeSections(): string[] {
  const markdown = loadKnowledgeBase();
  const titles: string[] = [];

  for (const heading of markdown.matchAll(/^#{2,3}\s+(.+)$/gm)) {
    const title = heading[1];
    if (title) titles.push(title);
  }

  return titles;
}
/** Upper bound, in characters, on the filtered diff before the truncation marker is appended. */
const MAX_DIFF_LENGTH = 5000;

/**
 * Reduces a unified diff to the per-file sections whose `diff --git` header
 * names a test-related path (`test/e2e/`, `page-objects/` or `.spec.`).
 *
 * Only the header line of each section is inspected, so a non-test file whose
 * content merely mentions one of those markers is not kept. Any text before
 * the first `diff --git` line is discarded.
 *
 * @param diff - Unified diff text made of `diff --git` sections.
 * @returns The kept sections concatenated in their original order, cut to
 * MAX_DIFF_LENGTH characters plus a "... (diff truncated)" marker when longer;
 * an empty string when nothing matches.
 */
function filterDiffToTestFiles(diff: string): string {
  const isTestPath = (header: string): boolean =>
    header.includes('test/e2e/') ||
    header.includes('page-objects/') ||
    header.includes('.spec.');

  // Element 0 is whatever precedes the first "diff --git " line, never a file section.
  const fileSections = diff.split(/^diff --git /m).slice(1);

  const testSections = fileSections.filter((section) => {
    const newlineIndex = section.indexOf('\n');
    const header = newlineIndex === -1 ? section : section.slice(0, newlineIndex);
    return isTestPath(header);
  });

  // Each section keeps its own trailing newline, so no separator is needed.
  const joined = testSections
    .map((section) => `diff --git ${section}`)
    .join('');

  if (joined.length > MAX_DIFF_LENGTH) {
    return `${joined.substring(0, MAX_DIFF_LENGTH)}\n... (diff truncated)`;
  }
  return joined;
}
/**
 * Searches merged pull requests of `owner/repo` for flaky-test fixes related
 * to `keyword`.
 *
 * The keyword is sanitized first; an empty result after sanitizing yields no
 * search at all. Two queries are tried in order (a "flaky"-scoped one, then a
 * broader "test e2e" one) and the first query that produces at least one fix
 * wins. A query that throws is logged and the next one is tried.
 *
 * @param octokit - Authenticated Octokit client.
 * @param owner - Repository owner.
 * @param repo - Repository name.
 * @param keyword - Free-text keyword to look for.
 * @returns The fixes of the first successful non-empty query, or an empty array.
 */
export async function searchFixesByKeyword(
  octokit: Octokit,
  owner: string,
  repo: string,
  keyword: string,
): Promise<PastFixExample[]> {
  const sanitized = sanitizeSearchQuery(keyword);
  if (!sanitized) return [];

  const scope = `repo:${owner}/${repo} is:pr is:merged`;
  const queries = [
    `${scope} "flaky" "${sanitized}"`,
    `${scope} "${sanitized}" test e2e`,
  ];

  for (const q of queries) {
    try {
      const fixes = await searchPRs(octokit, owner, repo, q);
      if (fixes.length > 0) return fixes;
    } catch (error: unknown) {
      // Best effort: a failing query must not prevent the fallback query from running.
      const message = error instanceof Error ? error.message : String(error);
      console.error(`Search query failed ("${q.substring(0, 60)}..."): ${message}`);
    }
  }

  return [];
}
/**
 * Shortens `text` so the result never exceeds `maxLength` characters.
 *
 * Callers pass `maxLength` as a hard cap for Slack block text, so the "..."
 * marker is counted inside the limit instead of being appended on top of it.
 *
 * @param text - Text to shorten.
 * @param maxLength - Maximum length of the returned string.
 * @returns `text` unchanged when it fits; otherwise a prefix of `text` ending
 * in "...". When `maxLength` is too small to hold the marker (3 or less), a
 * bare prefix of `maxLength` characters is returned.
 */
function truncate(text: string, maxLength: number): string {
  if (text.length <= maxLength) return text;

  const ellipsis = '...';
  if (maxLength <= ellipsis.length) {
    return text.substring(0, Math.max(0, maxLength));
  }
  return `${text.substring(0, maxLength - ellipsis.length)}${ellipsis}`;
}
/**
 * Posts the analysis results as replies in an existing Slack thread.
 *
 * A summary message is sent first, followed by one message per finding.
 * Messages are awaited one at a time, in the order of `findings`. Any error
 * from the Slack client propagates to the caller and stops the remaining posts.
 *
 * @param findings - Analysed failures to report.
 * @param threadTs - Timestamp of the Slack message to reply to.
 * @param botToken - Slack bot token used to create the client.
 * @param channelId - Channel that contains the thread.
 */
export async function postSlackFindings(
  findings: SlackFinding[],
  threadTs: string,
  botToken: string,
  channelId: string,
): Promise<void> {
  const slack = new WebClient(botToken);

  await slack.chat.postMessage({
    channel: channelId,
    thread_ts: threadTs,
    blocks: buildSummaryBlocks(findings),
    // Plain-text fallback shown where blocks cannot be rendered.
    text: `AI Analysis of ${findings.length} flaky tests`,
  });

  for (const finding of findings) {
    await slack.chat.postMessage({
      channel: channelId,
      thread_ts: threadTs,
      blocks: buildFindingBlocks(finding),
      text: `Analysis: ${finding.failure.name}`,
    });
  }
}
/**
 * Finds files under `test/e2e/` on the `main` tree whose path contains every
 * whitespace-separated keyword of `query` (case-insensitive).
 *
 * @param octokit - Authenticated Octokit client.
 * @param owner - Repository owner.
 * @param repo - Repository name.
 * @param query - Keywords that must all appear in the file path.
 * @returns Up to MAX_SEARCH_RESULTS matching paths, in tree order.
 */
export async function searchTestFiles(
  octokit: Octokit,
  owner: string,
  repo: string,
  query: string,
): Promise<string[]> {
  const { data: tree } = await octokit.rest.git.getTree({
    owner,
    repo,
    tree_sha: 'main',
    recursive: '1',
  });

  const keywords = query.toLowerCase().split(/\s+/);
  const matches: string[] = [];

  for (const item of tree.tree) {
    // Only files with a known path can match.
    if (item.type !== 'blob' || !item.path) continue;

    const lowerPath = item.path.toLowerCase();
    if (!lowerPath.includes('test/e2e/')) continue;
    if (!keywords.every((kw) => lowerPath.includes(kw))) continue;

    matches.push(item.path);
    if (matches.length >= MAX_SEARCH_RESULTS) break;
  }

  return matches;
}
/**
 * Extracts the page-object and flow modules a test file imports.
 *
 * Only relative specifiers (starting with ".") that contain `page-objects`,
 * `pages/` or `flows/` are kept. Each one is resolved against the directory of
 * `testFilePath` using POSIX separators, and `.ts` is appended unless the
 * specifier already ends in `.ts` or `.js`. A module imported more than once
 * is reported once, at the position of its first import.
 *
 * @param testFileContent - Source text of the test file.
 * @param testFilePath - Repo-relative path of the test file, with "/" separators.
 * @returns Resolved repo-relative paths, de-duplicated, in import order.
 */
function parseImportedPageObjects(
  testFileContent: string,
  testFilePath: string,
): string[] {
  const testDir = path.posix.dirname(testFilePath);
  const resolvedPaths = new Set<string>();

  for (const match of testFileContent.matchAll(/from\s+['"]([^'"]+)['"]/g)) {
    const importPath = match[1];
    if (!importPath || !importPath.startsWith('.')) continue;

    const isPageObjectOrFlow =
      importPath.includes('page-objects') ||
      importPath.includes('pages/') ||
      importPath.includes('flows/');
    if (!isPageObjectOrFlow) continue;

    let resolved = path.posix.join(testDir, importPath);
    if (!resolved.endsWith('.ts') && !resolved.endsWith('.js')) {
      resolved += '.ts';
    }
    resolvedPaths.add(resolved);
  }

  return [...resolvedPaths];
}
import { Octokit } from '@octokit/rest'; import unzipper from 'unzipper'; import { IncomingWebhook } from '@slack/webhook'; +import { WebClient } from '@slack/web-api'; const githubToken = process.env.GITHUB_TOKEN; if (!githubToken) throw new Error('Missing GITHUB_TOKEN env var'); @@ -18,6 +20,8 @@ const env = { WORKFLOW_ID: process.env.WORKFLOW_ID || 'ci.yml', BRANCH: process.env.BRANCH || 'main', SLACK_WEBHOOK_FLAKY_TESTS: process.env.SLACK_WEBHOOK_FLAKY_TESTS || '', + SLACK_BOT_TOKEN: process.env.SLACK_BOT_TOKEN || '', + SLACK_CHANNEL_ID: process.env.SLACK_CHANNEL_ID || '', TEST_REPORT_ARTIFACTS: process.env.TEST_REPORT_ARTIFACTS ? process.env.TEST_REPORT_ARTIFACTS.split(',').map(name => name.trim()) : ['test-e2e-android-json-report', 'test-e2e-ios-json-report', 'test-e2e-chrome-report', 'test-e2e-firefox-report'], @@ -326,27 +330,53 @@ function summarizeFailures(realFailures, flakyTests = []) { } async function sendSlackReport(summary, dateDisplay, workflowCount, failedCount) { - if (!env.SLACK_WEBHOOK_FLAKY_TESTS || !env.SLACK_WEBHOOK_FLAKY_TESTS.startsWith('https://')) { - console.log('Skipping Slack notification'); - return; + const useBotToken = env.SLACK_BOT_TOKEN && env.SLACK_CHANNEL_ID; + const useWebhook = env.SLACK_WEBHOOK_FLAKY_TESTS && env.SLACK_WEBHOOK_FLAKY_TESTS.startsWith('https://'); + + if (!useBotToken && !useWebhook) { + console.log('Skipping Slack notification (no SLACK_BOT_TOKEN+SLACK_CHANNEL_ID or SLACK_WEBHOOK_FLAKY_TESTS)'); + return null; } console.log('\n📤 Sending report to Slack...'); + const blocks = createSlackBlocks(summary, dateDisplay, workflowCount, failedCount); + const BATCH_SIZE = 50; + + if (useBotToken) { + try { + const slack = new WebClient(env.SLACK_BOT_TOKEN); + let threadTs = null; + + for (let i = 0; i < blocks.length; i += BATCH_SIZE) { + const batch = blocks.slice(i, i + BATCH_SIZE); + const result = await slack.chat.postMessage({ + channel: env.SLACK_CHANNEL_ID, + blocks: batch, + text: 'Flaky Test Report', + 
/**
 * Publishes a step output for GitHub Actions.
 *
 * When the GITHUB_OUTPUT environment file is available the value is appended
 * to it using the multiline `name<<delimiter` form, so values containing
 * newlines are written intact. The legacy `::set-output` workflow command is
 * printed only as a fallback when GITHUB_OUTPUT is not set; it is no longer
 * echoed in addition to the file, which duplicated the output and dumped the
 * full value into the job log.
 *
 * @param {string} name - Output name.
 * @param {string} value - Output value.
 */
function setGitHubOutput(name, value) {
  const outputFile = process.env.GITHUB_OUTPUT;

  if (!outputFile) {
    // Fallback for environments that do not provide the output file.
    console.log(`::set-output name=${name}::${value}`);
    return;
  }

  // The bare `crypto` global is not defined on every Node version; go through
  // globalThis so a missing global degrades to the fallback instead of throwing.
  const unique =
    globalThis.crypto?.randomUUID?.() ??
    `${Date.now()}_${Math.random().toString(36).slice(2)}`;
  const delimiter = `ghadelimiter_${unique}`;

  fs.appendFileSync(outputFile, `${name}<<${delimiter}\n${value}\n${delimiter}\n`);
}
summary.slice(0, 10); + const hasFailures = top10.length > 0; + const failuresJson = JSON.stringify(top10.map(test => ({ + name: test.name, + path: test.path, + realFailures: test.realFailures, + totalRetries: test.totalRetries, + lastError: test.lastRealFailureError || test.flakyFailureError || '', + jobId: test.lastRealFailureJobId || test.flakyFailureJobId, + runId: test.lastRealFailureRunId || test.flakyFailureRunId, + suite: test.suite, + isFlaky: test.realFailures === 0, + }))); + + setGitHubOutput('thread_ts', threadTs || ''); + setGitHubOutput('has_failures', hasFailures ? 'true' : 'false'); + setGitHubOutput('failures_json', failuresJson); } catch (error) { console.error('❌ Error:', error.message); diff --git a/.github/workflows/flaky-test-ai-analysis.yml b/.github/workflows/flaky-test-ai-analysis.yml new file mode 100644 index 00000000..39d4cb7a --- /dev/null +++ b/.github/workflows/flaky-test-ai-analysis.yml @@ -0,0 +1,70 @@ +name: Flaky Test AI Analysis (test) + +on: + workflow_dispatch: + inputs: + repository: + description: 'Target repository (e.g. metamask-extension)' + required: true + default: 'metamask-extension' + workflow-id: + description: 'Workflow ID to analyze (e.g. 
main.yml)' + required: true + default: 'main.yml' + dry-run: + description: 'Print analysis to stdout instead of posting to Slack' + type: boolean + default: true + +permissions: + contents: read + actions: read + +jobs: + flaky-report-and-analysis: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version-file: .nvmrc + cache: yarn + cache-dependency-path: yarn.lock + + - name: Enable Corepack + run: corepack enable + + - name: Install dependencies + run: yarn --immutable + + - name: Generate flaky test report + id: report + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }} + OWNER: MetaMask + REPOSITORY: ${{ inputs.repository }} + WORKFLOW_ID: ${{ inputs.workflow-id }} + BRANCH: main + run: node .github/scripts/create-flaky-test-report.mjs + + - name: AI analysis of flaky tests + if: steps.report.outputs.has_failures == 'true' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + E2E_CLAUDE_API_KEY: ${{ secrets.E2E_CLAUDE_API_KEY }} + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }} + SLACK_THREAD_TS: ${{ steps.report.outputs.thread_ts }} + TARGET_OWNER: MetaMask + TARGET_REPO: ${{ inputs.repository }} + FAILURES_JSON: ${{ steps.report.outputs.failures_json }} + run: | + ARGS="" + if [ "${{ inputs.dry-run }}" = "true" ]; then + ARGS="--dry-run" + fi + yarn ts-node --swc .github/scripts/analyze-flaky-tests/index.ts $ARGS diff --git a/package.json b/package.json index d7527130..5fb3c40e 100644 --- a/package.json +++ b/package.json @@ -23,9 +23,11 @@ "slack:release-testing": "node .github/scripts/slack-release-testing.mjs", "test": "jest && jest-it-up", "test:watch": "jest --watch", + "analyze-flaky-tests": "ts-node --swc .github/scripts/analyze-flaky-tests/index.ts", "update-release-sheet": "node 
.github/scripts/update-release-sheet.mjs" }, "dependencies": { + "@anthropic-ai/sdk": "^0.39.0", "@metamask/auto-changelog": "^5.3.2", "@metamask/utils": "^7.1.0", "@octokit/graphql": "^7.0.1", diff --git a/yarn.lock b/yarn.lock index 5dfdb61a..cefe00b4 100644 --- a/yarn.lock +++ b/yarn.lock @@ -21,6 +21,21 @@ __metadata: languageName: node linkType: hard +"@anthropic-ai/sdk@npm:^0.39.0": + version: 0.39.0 + resolution: "@anthropic-ai/sdk@npm:0.39.0" + dependencies: + "@types/node": "npm:^18.11.18" + "@types/node-fetch": "npm:^2.6.4" + abort-controller: "npm:^3.0.0" + agentkeepalive: "npm:^4.2.1" + form-data-encoder: "npm:1.7.2" + formdata-node: "npm:^4.3.2" + node-fetch: "npm:^2.6.7" + checksum: 10/8f1cb2d6a797ed095503ceec4271347ba9ee101c020fe3f5080c6853a5f3a9fc874649fcd0e3ae584c33e58548368cf3fb1da167221172859a1dff1e8c3419f6 + languageName: node + linkType: hard + "@babel/code-frame@npm:^7.0.0, @babel/code-frame@npm:^7.12.13, @babel/code-frame@npm:^7.18.6, @babel/code-frame@npm:^7.24.6": version: 7.24.6 resolution: "@babel/code-frame@npm:7.24.6" @@ -1012,6 +1027,7 @@ __metadata: version: 0.0.0-use.local resolution: "@metamask/github-tools@workspace:." 
dependencies: + "@anthropic-ai/sdk": "npm:^0.39.0" "@lavamoat/allow-scripts": "npm:^2.3.1" "@lavamoat/preinstall-always-fail": "npm:^1.0.0" "@metamask/auto-changelog": "npm:^5.3.2" @@ -1958,6 +1974,16 @@ __metadata: languageName: node linkType: hard +"@types/node-fetch@npm:^2.6.4": + version: 2.6.13 + resolution: "@types/node-fetch@npm:2.6.13" + dependencies: + "@types/node": "npm:*" + form-data: "npm:^4.0.4" + checksum: 10/944d52214791ebba482ca1393a4f0d62b0dbac5f7343ff42c128b75d5356d8bcefd4df77771b55c1acd19d118e16e9bd5d2792819c51bc13402d1c87c0975435 + languageName: node + linkType: hard + "@types/node@npm:*, @types/node@npm:>=12.0.0, @types/node@npm:>=18.0.0": version: 24.3.0 resolution: "@types/node@npm:24.3.0" @@ -1967,6 +1993,15 @@ __metadata: languageName: node linkType: hard +"@types/node@npm:^18.11.18": + version: 18.19.130 + resolution: "@types/node@npm:18.19.130" + dependencies: + undici-types: "npm:~5.26.4" + checksum: 10/ebb85c6edcec78df926de27d828ecbeb1b3d77c165ceef95bfc26e171edbc1924245db4eb2d7d6230206fe6b1a1f7665714fe1c70739e9f5980d8ce31af6ef82 + languageName: node + linkType: hard + "@types/node@npm:^20.3.2": version: 20.3.2 resolution: "@types/node@npm:20.3.2" @@ -2240,6 +2275,15 @@ __metadata: languageName: node linkType: hard +"abort-controller@npm:^3.0.0": + version: 3.0.0 + resolution: "abort-controller@npm:3.0.0" + dependencies: + event-target-shim: "npm:^5.0.0" + checksum: 10/ed84af329f1828327798229578b4fe03a4dd2596ba304083ebd2252666bdc1d7647d66d0b18704477e1f8aa315f055944aa6e859afebd341f12d0a53c37b4b40 + languageName: node + linkType: hard + "acorn-jsx@npm:^5.3.2": version: 5.3.2 resolution: "acorn-jsx@npm:5.3.2" @@ -4044,6 +4088,13 @@ __metadata: languageName: node linkType: hard +"event-target-shim@npm:^5.0.0": + version: 5.0.1 + resolution: "event-target-shim@npm:5.0.1" + checksum: 10/49ff46c3a7facbad3decb31f597063e761785d7fdb3920d4989d7b08c97a61c2f51183e2f3a03130c9088df88d4b489b1b79ab632219901f184f85158508f4c8 + languageName: node + 
linkType: hard + "eventemitter3@npm:^3.1.0": version: 3.1.2 resolution: "eventemitter3@npm:3.1.2" @@ -4357,6 +4408,13 @@ __metadata: languageName: node linkType: hard +"form-data-encoder@npm:1.7.2": + version: 1.7.2 + resolution: "form-data-encoder@npm:1.7.2" + checksum: 10/227bf2cea083284411fd67472ccc22f5cb354ca92c00690e11ff5ed942d993c13ac99dea365046306200f8bd71e1a7858d2d99e236de694b806b1f374a4ee341 + languageName: node + linkType: hard + "form-data@npm:^2.5.0": version: 2.5.2 resolution: "form-data@npm:2.5.2" @@ -4382,6 +4440,16 @@ __metadata: languageName: node linkType: hard +"formdata-node@npm:^4.3.2": + version: 4.4.1 + resolution: "formdata-node@npm:4.4.1" + dependencies: + node-domexception: "npm:1.0.0" + web-streams-polyfill: "npm:4.0.0-beta.3" + checksum: 10/29622f75533107c1bbcbe31fda683e6a55859af7f48ec354a9800591ce7947ed84cd3ef2b2fcb812047a884f17a1bac75ce098ffc17e23402cd373e49c1cd335 + languageName: node + linkType: hard + "fs-extra@npm:^11.2.0": version: 11.3.1 resolution: "fs-extra@npm:11.3.1" @@ -6414,6 +6482,13 @@ __metadata: languageName: node linkType: hard +"node-domexception@npm:1.0.0": + version: 1.0.0 + resolution: "node-domexception@npm:1.0.0" + checksum: 10/e332522f242348c511640c25a6fc7da4f30e09e580c70c6b13cb0be83c78c3e71c8d4665af2527e869fc96848924a4316ae7ec9014c091e2156f41739d4fa233 + languageName: node + linkType: hard + "node-fetch@npm:^2.6.7, node-fetch@npm:^2.6.9": version: 2.7.0 resolution: "node-fetch@npm:2.7.0" @@ -8245,6 +8320,13 @@ __metadata: languageName: node linkType: hard +"undici-types@npm:~5.26.4": + version: 5.26.5 + resolution: "undici-types@npm:5.26.5" + checksum: 10/0097779d94bc0fd26f0418b3a05472410408877279141ded2bd449167be1aed7ea5b76f756562cb3586a07f251b90799bab22d9019ceba49c037c76445f7cddd + languageName: node + linkType: hard + "undici-types@npm:~7.10.0": version: 7.10.0 resolution: "undici-types@npm:7.10.0" @@ -8386,6 +8468,13 @@ __metadata: languageName: node linkType: hard +"web-streams-polyfill@npm:4.0.0-beta.3": 
+ version: 4.0.0-beta.3 + resolution: "web-streams-polyfill@npm:4.0.0-beta.3" + checksum: 10/dcdef67de57d83008f9dc330662b65ba4497315555dd0e4e7bcacb132ffdf8a830eaab8f74ad40a4a44f542461f51223f406e2a446ece1cc29927859b1405853 + languageName: node + linkType: hard + "webidl-conversions@npm:^3.0.0": version: 3.0.1 resolution: "webidl-conversions@npm:3.0.1"