Cradicle Explorer

/ src / components / binary-feedback / utils.ts
utils.ts
  1  import { TextBlock, ToolUseBlock } from '@anthropic-ai/sdk/resources/index.mjs'
  2  import { AssistantMessage, BinaryFeedbackResult } from '../../query.js'
  3  import { MAIN_QUERY_TEMPERATURE } from '../../services/claude.js'
  4  import { getDynamicConfig, logEvent } from '../../services/statsig.js'
  5  
  6  import { isEqual, zip } from 'lodash-es'
  7  import { getGitState } from '../../utils/git.js'
  8  
/**
 * Outcome of comparing two candidate assistant messages.
 *
 * As consumed by getBinaryFeedbackResultForChoice in this file:
 * - 'prefer-left' / 'prefer-right' select the first / second message,
 * - 'no-preference' has one of the two picked at random,
 * - 'neither' selects no message at all.
 */
export type BinaryFeedbackChoice =
  | 'prefer-left'
  | 'prefer-right'
  | 'neither'
  | 'no-preference'
 14  
/**
 * Callback that receives a BinaryFeedbackChoice and returns nothing.
 * NOTE(review): not used inside this file; presumably handed to the UI that
 * collects the choice — confirm against the importing component.
 */
export type BinaryFeedbackChoose = (choice: BinaryFeedbackChoice) => void
 16  
/** Shape of the 'tengu-binary-feedback-config' dynamic config read below. */
type BinaryFeedbackConfig = {
  // Compared against Math.random() in shouldUseBinaryFeedback: a value of 0
  // turns sampling off, and a draw greater than this value skips sampling.
  sampleFrequency: number
}
 20  
 21  async function getBinaryFeedbackStatsigConfig(): Promise<BinaryFeedbackConfig> {
 22    return await getDynamicConfig('tengu-binary-feedback-config', {
 23      sampleFrequency: 0,
 24    })
 25  }
 26  
 27  function getMessageBlockSequence(m: AssistantMessage) {
 28    return m.message.content.map(cb => {
 29      if (cb.type === 'text') return 'text'
 30      if (cb.type === 'tool_use') return cb.name
 31      return cb.type // Handle other block types like 'thinking' or 'redacted_thinking'
 32    })
 33  }
 34  
 35  export async function logBinaryFeedbackEvent(
 36    m1: AssistantMessage,
 37    m2: AssistantMessage,
 38    choice: BinaryFeedbackChoice,
 39  ): Promise<void> {
 40    const modelA = m1.message.model
 41    const modelB = m2.message.model
 42    const gitState = await getGitState()
 43    logEvent('tengu_binary_feedback', {
 44      msg_id_A: m1.message.id,
 45      msg_id_B: m2.message.id,
 46      choice: {
 47        'prefer-left': m1.message.id,
 48        'prefer-right': m2.message.id,
 49        neither: undefined,
 50        'no-preference': undefined,
 51      }[choice],
 52      choiceStr: choice,
 53      gitHead: gitState?.commitHash,
 54      gitBranch: gitState?.branchName,
 55      gitRepoRemoteUrl: gitState?.remoteUrl || undefined,
 56      gitRepoIsHeadOnRemote: gitState?.isHeadOnRemote?.toString(),
 57      gitRepoIsClean: gitState?.isClean?.toString(),
 58      modelA,
 59      modelB,
 60      temperatureA: String(MAIN_QUERY_TEMPERATURE),
 61      temperatureB: String(MAIN_QUERY_TEMPERATURE),
 62      seqA: String(getMessageBlockSequence(m1)),
 63      seqB: String(getMessageBlockSequence(m2)),
 64    })
 65  }
 66  
 67  export async function logBinaryFeedbackSamplingDecision(
 68    decision: boolean,
 69    reason?: string,
 70  ): Promise<void> {
 71    logEvent('tengu_binary_feedback_sampling_decision', {
 72      decision: decision.toString(),
 73      reason,
 74    })
 75  }
 76  
 77  export async function logBinaryFeedbackDisplayDecision(
 78    decision: boolean,
 79    m1: AssistantMessage,
 80    m2: AssistantMessage,
 81    reason?: string,
 82  ): Promise<void> {
 83    logEvent('tengu_binary_feedback_display_decision', {
 84      decision: decision.toString(),
 85      reason,
 86      msg_id_A: m1.message.id,
 87      msg_id_B: m2.message.id,
 88      seqA: String(getMessageBlockSequence(m1)),
 89      seqB: String(getMessageBlockSequence(m2)),
 90    })
 91  }
 92  
 93  function textContentBlocksEqual(cb1: TextBlock, cb2: TextBlock): boolean {
 94    return cb1.text === cb2.text
 95  }
 96  
 97  function contentBlocksEqual(
 98    cb1: TextBlock | ToolUseBlock,
 99    cb2: TextBlock | ToolUseBlock,
100  ): boolean {
101    if (cb1.type !== cb2.type) {
102      return false
103    }
104    if (cb1.type === 'text') {
105      return textContentBlocksEqual(cb1, cb2 as TextBlock)
106    }
107    cb2 = cb2 as ToolUseBlock
108    return cb1.name === cb2.name && isEqual(cb1.input, cb2.input)
109  }
110  
/**
 * Checks that two block lists have the same length and that blocks at the
 * same position are equal according to contentBlocksEqual.
 */
function allContentBlocksEqual(
  content1: (TextBlock | ToolUseBlock)[],
  content2: (TextBlock | ToolUseBlock)[],
): boolean {
  if (content1.length !== content2.length) {
    return false
  }
  // Lengths match, so every zipped pair has both sides defined.
  for (const [left, right] of zip(content1, content2)) {
    if (!contentBlocksEqual(left!, right!)) {
      return false
    }
  }
  return true
}
122  
/**
 * Decides whether binary feedback should be sampled, logging every decision
 * through logBinaryFeedbackSamplingDecision.
 *
 * Rules are applied in this order, the first match wins:
 * 1. DISABLE_BINARY_FEEDBACK set -> false
 * 2. FORCE_BINARY_FEEDBACK set -> true
 * 3. USER_TYPE is not 'ant' -> false
 * 4. NODE_ENV is 'test' -> false
 * 5. configured sampleFrequency is 0 -> false
 * 6. Math.random() is greater than sampleFrequency -> false
 * 7. otherwise -> true
 */
export async function shouldUseBinaryFeedback(): Promise<boolean> {
  // Logs the decision without waiting for the logger, then hands the decision
  // back so each rule below is a single return statement.
  const decide = (decision: boolean, reason?: string): boolean => {
    void logBinaryFeedbackSamplingDecision(decision, reason)
    return decision
  }

  if (process.env.DISABLE_BINARY_FEEDBACK) {
    return decide(false, 'disabled_by_env_var')
  }
  if (process.env.FORCE_BINARY_FEEDBACK) {
    return decide(true, 'forced_by_env_var')
  }
  if (process.env.USER_TYPE !== 'ant') {
    return decide(false, 'not_ant')
  }
  if (process.env.NODE_ENV === 'test') {
    // Binary feedback breaks a couple of tests that check for permission, so
    // it is disabled under test at the risk of hiding bugs.
    return decide(false, 'test')
  }

  const { sampleFrequency } = await getBinaryFeedbackStatsigConfig()
  if (sampleFrequency === 0) {
    return decide(false, 'top_level_frequency_zero')
  }
  if (Math.random() > sampleFrequency) {
    return decide(false, 'top_level_frequency_rng')
  }
  return decide(true)
}
155  
/**
 * Decides whether two candidate messages differ enough to be shown for binary
 * feedback, logging the decision through logBinaryFeedbackDisplayDecision.
 *
 * Thinking and redacted-thinking blocks are ignored. When neither message
 * uses a tool, all remaining blocks are compared; when at least one does,
 * only the tool_use blocks are compared and text differences are ignored.
 *
 * @returns false when the compared blocks are identical, true otherwise.
 */
export function messagePairValidForBinaryFeedback(
  m1: AssistantMessage,
  m2: AssistantMessage,
): boolean {
  // Thinking blocks are dropped on the assumption that users find them less
  // relevant than the other content types.
  const visibleBlocks1 = m1.message.content.filter(
    block => block.type !== 'thinking' && block.type !== 'redacted_thinking',
  )
  const visibleBlocks2 = m2.message.content.filter(
    block => block.type !== 'thinking' && block.type !== 'redacted_thinking',
  )
  const eitherUsesTools =
    visibleBlocks1.some(block => block.type === 'tool_use') ||
    visibleBlocks2.some(block => block.type === 'tool_use')

  // With tools involved, the tool calls are the most material difference, so
  // only those are compared; otherwise every remaining block is compared.
  const identical = eitherUsesTools
    ? allContentBlocksEqual(
        visibleBlocks1.filter(block => block.type === 'tool_use'),
        visibleBlocks2.filter(block => block.type === 'tool_use'),
      )
    : allContentBlocksEqual(visibleBlocks1, visibleBlocks2)

  if (identical) {
    void logBinaryFeedbackDisplayDecision(false, m1, m2, 'contents_identical')
    return false
  }
  void logBinaryFeedbackDisplayDecision(true, m1, m2)
  return true
}
201  
/**
 * Maps a binary-feedback choice onto the result for a message pair.
 *
 * - 'prefer-left' / 'prefer-right': the chosen message, with
 *   `shouldSkipPermissionCheck` set to true.
 * - 'no-preference': one of the two messages picked with equal probability,
 *   with `shouldSkipPermissionCheck` set to false.
 * - 'neither': a null message, with `shouldSkipPermissionCheck` set to false.
 *
 * @param m1 - The message returned for 'prefer-left'.
 * @param m2 - The message returned for 'prefer-right'.
 * @param choice - The choice to resolve.
 * @returns The result for the given choice.
 * @throws Error if `choice` is not a known BinaryFeedbackChoice at runtime.
 */
export function getBinaryFeedbackResultForChoice(
  m1: AssistantMessage,
  m2: AssistantMessage,
  choice: BinaryFeedbackChoice,
): BinaryFeedbackResult {
  switch (choice) {
    case 'prefer-left':
      return { message: m1, shouldSkipPermissionCheck: true }
    case 'prefer-right':
      return { message: m2, shouldSkipPermissionCheck: true }
    case 'no-preference':
      return {
        message: Math.random() < 0.5 ? m1 : m2,
        shouldSkipPermissionCheck: false,
      }
    case 'neither':
      return { message: null, shouldSkipPermissionCheck: false }
    default: {
      // Exhaustiveness guard: fails to compile if a new choice is added
      // without a case above, and fails loudly at runtime instead of
      // returning undefined for an unexpected value.
      const unhandled: never = choice
      throw new Error(`Unhandled binary feedback choice: ${String(unhandled)}`)
    }
  }
}