utils.ts
import { TextBlock, ToolUseBlock } from '@anthropic-ai/sdk/resources/index.mjs'
import { AssistantMessage, BinaryFeedbackResult } from '../../query.js'
import { MAIN_QUERY_TEMPERATURE } from '../../services/claude.js'
import { getDynamicConfig, logEvent } from '../../services/statsig.js'

import { isEqual, zip } from 'lodash-es'
import { getGitState } from '../../utils/git.js'

/** The verdict a user can give when comparing two candidate assistant messages. */
export type BinaryFeedbackChoice =
  | 'prefer-left'
  | 'prefer-right'
  | 'neither'
  | 'no-preference'

/** Callback through which a {@link BinaryFeedbackChoice} is reported. */
export type BinaryFeedbackChoose = (choice: BinaryFeedbackChoice) => void

type BinaryFeedbackConfig = {
  /**
   * Threshold compared against `Math.random()` in `shouldUseBinaryFeedback`;
   * `0` disables sampling entirely.
   */
  sampleFrequency: number
}

/**
 * Fetches the binary feedback dynamic config, using `sampleFrequency: 0` as
 * the default value handed to `getDynamicConfig`.
 */
async function getBinaryFeedbackStatsigConfig(): Promise<BinaryFeedbackConfig> {
  return getDynamicConfig('tengu-binary-feedback-config', {
    sampleFrequency: 0,
  })
}

/**
 * Summarizes a message as one label per content block: `'text'` for text
 * blocks, the tool name for tool_use blocks, and the raw block type otherwise.
 */
function getMessageBlockSequence(m: AssistantMessage) {
  return m.message.content.map(cb => {
    if (cb.type === 'text') return 'text'
    if (cb.type === 'tool_use') return cb.name
    return cb.type // Handle other block types like 'thinking' or 'redacted_thinking'
  })
}

/**
 * Logs the user's choice between two candidate messages, together with the
 * current git state, both model names, the query temperature and each
 * message's block sequence.
 *
 * @param m1 - The message referred to as "A" / "left".
 * @param m2 - The message referred to as "B" / "right".
 * @param choice - The user's verdict.
 */
export async function logBinaryFeedbackEvent(
  m1: AssistantMessage,
  m2: AssistantMessage,
  choice: BinaryFeedbackChoice,
): Promise<void> {
  const modelA = m1.message.model
  const modelB = m2.message.model
  const gitState = await getGitState()
  // Id of the preferred message; undefined when no single message was chosen.
  const chosenMessageId = {
    'prefer-left': m1.message.id,
    'prefer-right': m2.message.id,
    neither: undefined,
    'no-preference': undefined,
  }[choice]
  logEvent('tengu_binary_feedback', {
    msg_id_A: m1.message.id,
    msg_id_B: m2.message.id,
    choice: chosenMessageId,
    choiceStr: choice,
    gitHead: gitState?.commitHash,
    gitBranch: gitState?.branchName,
    gitRepoRemoteUrl: gitState?.remoteUrl || undefined,
    gitRepoIsHeadOnRemote: gitState?.isHeadOnRemote?.toString(),
    gitRepoIsClean: gitState?.isClean?.toString(),
    modelA,
    modelB,
    temperatureA: String(MAIN_QUERY_TEMPERATURE),
    temperatureB: String(MAIN_QUERY_TEMPERATURE),
    seqA: String(getMessageBlockSequence(m1)),
    seqB: String(getMessageBlockSequence(m2)),
  })
}

/**
 * Logs whether binary feedback was sampled for this query.
 *
 * @param decision - Whether binary feedback will be used.
 * @param reason - Optional label for why the decision was made.
 */
export async function logBinaryFeedbackSamplingDecision(
  decision: boolean,
  reason?: string,
): Promise<void> {
  logEvent('tengu_binary_feedback_sampling_decision', {
    decision: decision.toString(),
    reason,
  })
}

/**
 * Logs whether a sampled message pair is actually shown for comparison.
 *
 * @param decision - Whether the pair will be displayed.
 * @param m1 - The message logged as "A".
 * @param m2 - The message logged as "B".
 * @param reason - Optional label for why the decision was made.
 */
export async function logBinaryFeedbackDisplayDecision(
  decision: boolean,
  m1: AssistantMessage,
  m2: AssistantMessage,
  reason?: string,
): Promise<void> {
  logEvent('tengu_binary_feedback_display_decision', {
    decision: decision.toString(),
    reason,
    msg_id_A: m1.message.id,
    msg_id_B: m2.message.id,
    seqA: String(getMessageBlockSequence(m1)),
    seqB: String(getMessageBlockSequence(m2)),
  })
}

/** Two text blocks are equal when their text is identical. */
function textContentBlocksEqual(cb1: TextBlock, cb2: TextBlock): boolean {
  return cb1.text === cb2.text
}

/**
 * Compares two content blocks: they must share a type, and then either have
 * identical text (text blocks) or the same tool name and deeply-equal input
 * (tool_use blocks).
 */
function contentBlocksEqual(
  cb1: TextBlock | ToolUseBlock,
  cb2: TextBlock | ToolUseBlock,
): boolean {
  if (cb1.type !== cb2.type) {
    return false
  }
  if (cb1.type === 'text') {
    return textContentBlocksEqual(cb1, cb2 as TextBlock)
  }
  // The types match and cb1 is not a text block, so treat cb2 as tool_use too.
  const otherToolUse = cb2 as ToolUseBlock
  return cb1.name === otherToolUse.name && isEqual(cb1.input, otherToolUse.input)
}

/** True when both lists have the same length and are pairwise equal, in order. */
function allContentBlocksEqual(
  content1: (TextBlock | ToolUseBlock)[],
  content2: (TextBlock | ToolUseBlock)[],
): boolean {
  if (content1.length !== content2.length) {
    return false
  }
  // Lengths match, so zip never pads either side with undefined.
  return zip(content1, content2).every(([cb1, cb2]) =>
    contentBlocksEqual(cb1!, cb2!),
  )
}

/**
 * Decides whether binary feedback should be used, logging the decision and
 * its reason. Checks run in order: the DISABLE_BINARY_FEEDBACK and
 * FORCE_BINARY_FEEDBACK environment variables, USER_TYPE, NODE_ENV, and
 * finally the configured sample frequency.
 *
 * @returns `true` when binary feedback should be used.
 */
export async function shouldUseBinaryFeedback(): Promise<boolean> {
  // Decision logging is fire-and-forget; `void` marks the un-awaited promise
  // as intentional.
  if (process.env.DISABLE_BINARY_FEEDBACK) {
    void logBinaryFeedbackSamplingDecision(false, 'disabled_by_env_var')
    return false
  }
  if (process.env.FORCE_BINARY_FEEDBACK) {
    void logBinaryFeedbackSamplingDecision(true, 'forced_by_env_var')
    return true
  }
  if (process.env.USER_TYPE !== 'ant') {
    void logBinaryFeedbackSamplingDecision(false, 'not_ant')
    return false
  }
  if (process.env.NODE_ENV === 'test') {
    // Binary feedback breaks a couple of tests related to checking for
    // permission, so we have to disable it in tests at the risk of hiding bugs
    void logBinaryFeedbackSamplingDecision(false, 'test')
    return false
  }

  const { sampleFrequency } = await getBinaryFeedbackStatsigConfig()
  // Nothing in this file validates the fetched config at runtime. A missing
  // or non-numeric sampleFrequency makes `Math.random() > sampleFrequency`
  // a NaN comparison, which is always false and would enable binary feedback
  // on every call. Fail closed instead.
  if (Number.isNaN(Number(sampleFrequency))) {
    void logBinaryFeedbackSamplingDecision(false, 'top_level_frequency_invalid')
    return false
  }
  if (sampleFrequency === 0) {
    void logBinaryFeedbackSamplingDecision(false, 'top_level_frequency_zero')
    return false
  }
  if (Math.random() > sampleFrequency) {
    void logBinaryFeedbackSamplingDecision(false, 'top_level_frequency_rng')
    return false
  }
  void logBinaryFeedbackSamplingDecision(true)
  return true
}

/**
 * Decides whether two candidate messages differ enough to be shown for
 * comparison, logging the display decision either way.
 *
 * Thinking blocks are ignored. If neither message uses a tool, all remaining
 * blocks are compared; otherwise only the tool_use blocks are compared.
 *
 * @returns `true` when the compared blocks differ.
 */
export function messagePairValidForBinaryFeedback(
  m1: AssistantMessage,
  m2: AssistantMessage,
): boolean {
  // Fire-and-forget logging helpers.
  const logPass = (): void => {
    void logBinaryFeedbackDisplayDecision(true, m1, m2)
  }
  const logFail = (reason: string): void => {
    void logBinaryFeedbackDisplayDecision(false, m1, m2, reason)
  }

  // Ignore thinking blocks, on the assumption that users don't find them very relevant
  // compared to other content types
  const nonThinkingBlocks1 = m1.message.content.filter(
    b => b.type !== 'thinking' && b.type !== 'redacted_thinking',
  )
  const nonThinkingBlocks2 = m2.message.content.filter(
    b => b.type !== 'thinking' && b.type !== 'redacted_thinking',
  )
  const hasToolUse =
    nonThinkingBlocks1.some(b => b.type === 'tool_use') ||
    nonThinkingBlocks2.some(b => b.type === 'tool_use')

  // If there are tools, they're the most material difference between the
  // messages, so only the tool_use blocks are compared and text is ignored.
  // Otherwise every non-thinking block is compared.
  const contentsIdentical = hasToolUse
    ? allContentBlocksEqual(
        nonThinkingBlocks1.filter(b => b.type === 'tool_use'),
        nonThinkingBlocks2.filter(b => b.type === 'tool_use'),
      )
    : allContentBlocksEqual(nonThinkingBlocks1, nonThinkingBlocks2)

  if (contentsIdentical) {
    logFail('contents_identical')
    return false
  }

  logPass()
  return true
}

/**
 * Maps the user's choice to the message to continue with.
 *
 * - `prefer-left` / `prefer-right`: the chosen message, with
 *   `shouldSkipPermissionCheck` set to `true`.
 * - `no-preference`: one of the two messages picked at random, with
 *   `shouldSkipPermissionCheck` set to `false`.
 * - `neither`: `message` is `null` and `shouldSkipPermissionCheck` is `false`.
 */
export function getBinaryFeedbackResultForChoice(
  m1: AssistantMessage,
  m2: AssistantMessage,
  choice: BinaryFeedbackChoice,
): BinaryFeedbackResult {
  switch (choice) {
    case 'prefer-left':
      return { message: m1, shouldSkipPermissionCheck: true }
    case 'prefer-right':
      return { message: m2, shouldSkipPermissionCheck: true }
    case 'no-preference':
      return {
        message: Math.random() < 0.5 ? m1 : m2,
        shouldSkipPermissionCheck: false,
      }
    case 'neither':
      return { message: null, shouldSkipPermissionCheck: false }
  }
}