/ native-ts / file-index / index.ts
index.ts
  1  /**
  2   * Pure-TypeScript port of vendor/file-index-src (Rust NAPI module).
  3   *
  4   * The native module wraps nucleo (https://github.com/helix-editor/nucleo) for
  5   * high-performance fuzzy file searching. This port reimplements the same API
  6   * and scoring behavior without native dependencies.
  7   *
  8   * Key API:
  9   *   new FileIndex()
 10   *   .loadFromFileList(fileList: string[]): void   — dedupe + index paths
 11   *   .search(query: string, limit: number): SearchResult[]
 12   *
 13   * Score semantics: lower = better. Score is position-in-results / result-count,
 14   * so the best match is 0.0. Paths containing "test" get a 1.05× penalty (capped
 15   * at 1.0) so non-test files rank slightly higher.
 16   */
 17  
 18  export type SearchResult = {
 19    path: string
 20    score: number
 21  }
 22  
 23  // nucleo-style scoring constants (approximating fzf-v2 / nucleo bonuses)
 24  const SCORE_MATCH = 16
 25  const BONUS_BOUNDARY = 8
 26  const BONUS_CAMEL = 6
 27  const BONUS_CONSECUTIVE = 4
 28  const BONUS_FIRST_CHAR = 8
 29  const PENALTY_GAP_START = 3
 30  const PENALTY_GAP_EXTENSION = 1
 31  
 32  const TOP_LEVEL_CACHE_LIMIT = 100
 33  const MAX_QUERY_LEN = 64
 34  // Yield to event loop after this many ms of sync work. Chunk sizes are
 35  // time-based (not count-based) so slow machines get smaller chunks and
 36  // stay responsive — 5k paths is ~2ms on M-series but could be 15ms+ on
 37  // older Windows hardware.
 38  const CHUNK_MS = 4
 39  
 40  // Reusable buffer: records where each needle char matched during the indexOf scan
 41  const posBuf = new Int32Array(MAX_QUERY_LEN)
 42  
 43  export class FileIndex {
 44    private paths: string[] = []
 45    private lowerPaths: string[] = []
 46    private charBits: Int32Array = new Int32Array(0)
 47    private pathLens: Uint16Array = new Uint16Array(0)
 48    private topLevelCache: SearchResult[] | null = null
 49    // During async build, tracks how many paths have bitmap/lowerPath filled.
 50    // search() uses this to search the ready prefix while build continues.
 51    private readyCount = 0
 52  
 53    /**
 54     * Load paths from an array of strings.
 55     * This is the main way to populate the index — ripgrep collects files, we just search them.
 56     * Automatically deduplicates paths.
 57     */
 58    loadFromFileList(fileList: string[]): void {
 59      // Deduplicate and filter empty strings (matches Rust HashSet behavior)
 60      const seen = new Set<string>()
 61      const paths: string[] = []
 62      for (const line of fileList) {
 63        if (line.length > 0 && !seen.has(line)) {
 64          seen.add(line)
 65          paths.push(line)
 66        }
 67      }
 68  
 69      this.buildIndex(paths)
 70    }
 71  
 72    /**
 73     * Async variant: yields to the event loop every ~8–12k paths so large
 74     * indexes (270k+ files) don't block the main thread for >10ms at a time.
 75     * Identical result to loadFromFileList.
 76     *
 77     * Returns { queryable, done }:
 78     *   - queryable: resolves as soon as the first chunk is indexed (search
 79     *     returns partial results). For a 270k-path list this is ~5–10ms of
 80     *     sync work after the paths array is available.
 81     *   - done: resolves when the entire index is built.
 82     */
 83    loadFromFileListAsync(fileList: string[]): {
 84      queryable: Promise<void>
 85      done: Promise<void>
 86    } {
 87      let markQueryable: () => void = () => {}
 88      const queryable = new Promise<void>(resolve => {
 89        markQueryable = resolve
 90      })
 91      const done = this.buildAsync(fileList, markQueryable)
 92      return { queryable, done }
 93    }
 94  
 95    private async buildAsync(
 96      fileList: string[],
 97      markQueryable: () => void,
 98    ): Promise<void> {
 99      const seen = new Set<string>()
100      const paths: string[] = []
101      let chunkStart = performance.now()
102      for (let i = 0; i < fileList.length; i++) {
103        const line = fileList[i]!
104        if (line.length > 0 && !seen.has(line)) {
105          seen.add(line)
106          paths.push(line)
107        }
108        // Check every 256 iterations to amortize performance.now() overhead
109        if ((i & 0xff) === 0xff && performance.now() - chunkStart > CHUNK_MS) {
110          await yieldToEventLoop()
111          chunkStart = performance.now()
112        }
113      }
114  
115      this.resetArrays(paths)
116  
117      chunkStart = performance.now()
118      let firstChunk = true
119      for (let i = 0; i < paths.length; i++) {
120        this.indexPath(i)
121        if ((i & 0xff) === 0xff && performance.now() - chunkStart > CHUNK_MS) {
122          this.readyCount = i + 1
123          if (firstChunk) {
124            markQueryable()
125            firstChunk = false
126          }
127          await yieldToEventLoop()
128          chunkStart = performance.now()
129        }
130      }
131      this.readyCount = paths.length
132      markQueryable()
133    }
134  
135    private buildIndex(paths: string[]): void {
136      this.resetArrays(paths)
137      for (let i = 0; i < paths.length; i++) {
138        this.indexPath(i)
139      }
140      this.readyCount = paths.length
141    }
142  
143    private resetArrays(paths: string[]): void {
144      const n = paths.length
145      this.paths = paths
146      this.lowerPaths = new Array(n)
147      this.charBits = new Int32Array(n)
148      this.pathLens = new Uint16Array(n)
149      this.readyCount = 0
150      this.topLevelCache = computeTopLevelEntries(paths, TOP_LEVEL_CACHE_LIMIT)
151    }
152  
153    // Precompute: lowercase, a–z bitmap, length. Bitmap gives O(1) rejection
154    // of paths missing any needle letter (89% survival for broad queries like
155    // "test" → still a 10%+ free win; 90%+ rejection for rare chars).
156    private indexPath(i: number): void {
157      const lp = this.paths[i]!.toLowerCase()
158      this.lowerPaths[i] = lp
159      const len = lp.length
160      this.pathLens[i] = len
161      let bits = 0
162      for (let j = 0; j < len; j++) {
163        const c = lp.charCodeAt(j)
164        if (c >= 97 && c <= 122) bits |= 1 << (c - 97)
165      }
166      this.charBits[i] = bits
167    }
168  
169    /**
170     * Search for files matching the query using fuzzy matching.
171     * Returns top N results sorted by match score.
172     */
173    search(query: string, limit: number): SearchResult[] {
174      if (limit <= 0) return []
175      if (query.length === 0) {
176        if (this.topLevelCache) {
177          return this.topLevelCache.slice(0, limit)
178        }
179        return []
180      }
181  
182      // Smart case: lowercase query → case-insensitive; any uppercase → case-sensitive
183      const caseSensitive = query !== query.toLowerCase()
184      const needle = caseSensitive ? query : query.toLowerCase()
185      const nLen = Math.min(needle.length, MAX_QUERY_LEN)
186      const needleChars: string[] = new Array(nLen)
187      let needleBitmap = 0
188      for (let j = 0; j < nLen; j++) {
189        const ch = needle.charAt(j)
190        needleChars[j] = ch
191        const cc = ch.charCodeAt(0)
192        if (cc >= 97 && cc <= 122) needleBitmap |= 1 << (cc - 97)
193      }
194  
195      // Upper bound on score assuming every match gets the max boundary bonus.
196      // Used to reject paths whose gap penalties alone make them unable to beat
197      // the current top-k threshold, before the charCodeAt-heavy boundary pass.
198      const scoreCeiling =
199        nLen * (SCORE_MATCH + BONUS_BOUNDARY) + BONUS_FIRST_CHAR + 32
200  
201      // Top-k: maintain a sorted-ascending array of the best `limit` matches.
202      // Avoids O(n log n) sort of all matches when we only need `limit` of them.
203      const topK: { path: string; fuzzScore: number }[] = []
204      let threshold = -Infinity
205  
206      const { paths, lowerPaths, charBits, pathLens, readyCount } = this
207  
208      outer: for (let i = 0; i < readyCount; i++) {
209        // O(1) bitmap reject: path must contain every letter in the needle
210        if ((charBits[i]! & needleBitmap) !== needleBitmap) continue
211  
212        const haystack = caseSensitive ? paths[i]! : lowerPaths[i]!
213  
214        // Fused indexOf scan: find positions (SIMD-accelerated in JSC/V8) AND
215        // accumulate gap/consecutive terms inline. The greedy-earliest positions
216        // found here are identical to what the charCodeAt scorer would find, so
217        // we score directly from them — no second scan.
218        let pos = haystack.indexOf(needleChars[0]!)
219        if (pos === -1) continue
220        posBuf[0] = pos
221        let gapPenalty = 0
222        let consecBonus = 0
223        let prev = pos
224        for (let j = 1; j < nLen; j++) {
225          pos = haystack.indexOf(needleChars[j]!, prev + 1)
226          if (pos === -1) continue outer
227          posBuf[j] = pos
228          const gap = pos - prev - 1
229          if (gap === 0) consecBonus += BONUS_CONSECUTIVE
230          else gapPenalty += PENALTY_GAP_START + gap * PENALTY_GAP_EXTENSION
231          prev = pos
232        }
233  
234        // Gap-bound reject: if the best-case score (all boundary bonuses) minus
235        // known gap penalties can't beat threshold, skip the boundary pass.
236        if (
237          topK.length === limit &&
238          scoreCeiling + consecBonus - gapPenalty <= threshold
239        ) {
240          continue
241        }
242  
243        // Boundary/camelCase scoring: check the char before each match position.
244        const path = paths[i]!
245        const hLen = pathLens[i]!
246        let score = nLen * SCORE_MATCH + consecBonus - gapPenalty
247        score += scoreBonusAt(path, posBuf[0]!, true)
248        for (let j = 1; j < nLen; j++) {
249          score += scoreBonusAt(path, posBuf[j]!, false)
250        }
251        score += Math.max(0, 32 - (hLen >> 2))
252  
253        if (topK.length < limit) {
254          topK.push({ path, fuzzScore: score })
255          if (topK.length === limit) {
256            topK.sort((a, b) => a.fuzzScore - b.fuzzScore)
257            threshold = topK[0]!.fuzzScore
258          }
259        } else if (score > threshold) {
260          let lo = 0
261          let hi = topK.length
262          while (lo < hi) {
263            const mid = (lo + hi) >> 1
264            if (topK[mid]!.fuzzScore < score) lo = mid + 1
265            else hi = mid
266          }
267          topK.splice(lo, 0, { path, fuzzScore: score })
268          topK.shift()
269          threshold = topK[0]!.fuzzScore
270        }
271      }
272  
273      // topK is ascending; reverse to descending (best first)
274      topK.sort((a, b) => b.fuzzScore - a.fuzzScore)
275  
276      const matchCount = topK.length
277      const denom = Math.max(matchCount, 1)
278      const results: SearchResult[] = new Array(matchCount)
279  
280      for (let i = 0; i < matchCount; i++) {
281        const path = topK[i]!.path
282        const positionScore = i / denom
283        const finalScore = path.includes('test')
284          ? Math.min(positionScore * 1.05, 1.0)
285          : positionScore
286        results[i] = { path, score: finalScore }
287      }
288  
289      return results
290    }
291  }
292  
293  /**
294   * Boundary/camelCase bonus for a match at position `pos` in the original-case
295   * path. `first` enables the start-of-string bonus (only for needle[0]).
296   */
297  function scoreBonusAt(path: string, pos: number, first: boolean): number {
298    if (pos === 0) return first ? BONUS_FIRST_CHAR : 0
299    const prevCh = path.charCodeAt(pos - 1)
300    if (isBoundary(prevCh)) return BONUS_BOUNDARY
301    if (isLower(prevCh) && isUpper(path.charCodeAt(pos))) return BONUS_CAMEL
302    return 0
303  }
304  
305  function isBoundary(code: number): boolean {
306    // / \ - _ . space
307    return (
308      code === 47 || // /
309      code === 92 || // \
310      code === 45 || // -
311      code === 95 || // _
312      code === 46 || // .
313      code === 32 // space
314    )
315  }
316  
317  function isLower(code: number): boolean {
318    return code >= 97 && code <= 122
319  }
320  
321  function isUpper(code: number): boolean {
322    return code >= 65 && code <= 90
323  }
324  
325  export function yieldToEventLoop(): Promise<void> {
326    return new Promise(resolve => setImmediate(resolve))
327  }
328  
329  export { CHUNK_MS }
330  
331  /**
332   * Extract unique top-level path segments, sorted by (length asc, then alpha asc).
333   * Handles both Unix (/) and Windows (\) path separators.
334   * Mirrors FileIndex::compute_top_level_entries in lib.rs.
335   */
336  function computeTopLevelEntries(
337    paths: string[],
338    limit: number,
339  ): SearchResult[] {
340    const topLevel = new Set<string>()
341  
342    for (const p of paths) {
343      // Split on first / or \ separator
344      let end = p.length
345      for (let i = 0; i < p.length; i++) {
346        const c = p.charCodeAt(i)
347        if (c === 47 || c === 92) {
348          end = i
349          break
350        }
351      }
352      const segment = p.slice(0, end)
353      if (segment.length > 0) {
354        topLevel.add(segment)
355        if (topLevel.size >= limit) break
356      }
357    }
358  
359    const sorted = Array.from(topLevel)
360    sorted.sort((a, b) => {
361      const lenDiff = a.length - b.length
362      if (lenDiff !== 0) return lenDiff
363      return a < b ? -1 : a > b ? 1 : 0
364    })
365  
366    return sorted.slice(0, limit).map(path => ({ path, score: 0.0 }))
367  }
368  
369  export default FileIndex
370  export type { FileIndex as FileIndexType }