subtitle.go
1 package util 2 3 import ( 4 "bufio" 5 "fmt" 6 "krillin-ai/internal/storage" 7 "krillin-ai/internal/types" 8 "os" 9 "os/exec" 10 "path/filepath" 11 "regexp" 12 "strconv" 13 "strings" 14 "unicode" 15 ) 16 17 // 处理每一个字幕块 18 func ProcessBlock(block []string, targetLanguageFile, targetLanguageTextFile, originLanguageFile, originLanguageTextFile *os.File, isTargetOnTop bool) { 19 var targetLines, originLines []string 20 // 匹配时间戳的正则表达式 21 timePattern := regexp.MustCompile(`\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}`) 22 for _, line := range block { 23 if timePattern.MatchString(line) || IsNumber(line) { 24 // 时间戳和编号行保留在两个文件中 25 targetLines = append(targetLines, line) 26 originLines = append(originLines, line) 27 continue 28 } 29 if len(targetLines) == 2 && len(originLines) == 2 { // 刚写完编号和时间戳,到了上方的文字行 30 if isTargetOnTop { 31 targetLines = append(targetLines, line) 32 targetLanguageTextFile.WriteString(line + " ") // 文稿文件 33 } else { 34 originLines = append(originLines, line) 35 originLanguageTextFile.WriteString(line + " ") 36 } 37 continue 38 } 39 // 到了下方的文字行 40 if isTargetOnTop { 41 originLines = append(originLines, line) 42 originLanguageTextFile.WriteString(line + " ") 43 } else { 44 targetLines = append(targetLines, line) 45 targetLanguageTextFile.WriteString(line + " ") 46 } 47 } 48 49 if len(targetLines) > 2 { 50 // 写入目标语言文件 51 for _, line := range targetLines { 52 targetLanguageFile.WriteString(line + "\n") 53 } 54 targetLanguageFile.WriteString("\n") 55 } 56 57 if len(originLines) > 2 { 58 // 写入源语言文件 59 for _, line := range originLines { 60 originLanguageFile.WriteString(line + "\n") 61 } 62 originLanguageFile.WriteString("\n") 63 } 64 } 65 66 // IsSubtitleText 是否是字幕文件中的字幕文字行 67 func IsSubtitleText(line string) bool { 68 if line == "" { 69 return false 70 } 71 if IsNumber(line) { 72 return false 73 } 74 timelinePattern := regexp.MustCompile(`\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}`) 75 return !timelinePattern.MatchString(line) 76 } 77 78 type Format struct { 79 Duration string `json:"duration"` 80 } 81 82 type ProbeData struct { 83 Format Format `json:"format"` 84 } 85 86 type SrtBlock struct { 87 Index int 88 Timestamp string 89 TargetLanguageSentence string 90 OriginLanguageSentence string 91 } 92 93 func TrimString(s string) string { 94 s = strings.Replace(s, "[中文翻译]", "", -1) 95 s = strings.Replace(s, "[英文句子]", "", -1) 96 // 去除开头的空格和 '[' 97 s = strings.TrimLeft(s, " [") 98 99 // 去除结尾的空格和 ']' 100 s = strings.TrimRight(s, " ]") 101 102 //替换中文单引号 103 s = strings.ReplaceAll(s, "’", "'") 104 105 return s 106 } 107 108 func SplitSentence(sentence string) []string { 109 // 使用正则表达式移除标点符号和特殊字符(保留各语言字母、数字和空格) 110 re := regexp.MustCompile(`[^\p{L}\p{N}\s']+`) 111 cleanedSentence := re.ReplaceAllString(sentence, " ") 112 113 // 使用 strings.Fields 按空格拆分成单词 114 words := strings.Fields(cleanedSentence) 115 116 return words 117 } 118 119 func MergeFile(finalFile string, files ...string) error { 120 // 创建最终文件 121 final, err := os.Create(finalFile) 122 if err != nil { 123 return err 124 } 125 126 // 逐个读取文件并写入最终文件 127 for _, file := range files { 128 f, err := os.Open(file) 129 if err != nil { 130 return err 131 } 132 defer f.Close() 133 134 scanner := bufio.NewScanner(f) 135 for scanner.Scan() { 136 line := scanner.Text() 137 final.WriteString(line + "\n") 138 } 139 } 140 141 return nil 142 } 143 144 func MergeSrtFiles(finalFile string, files ...string) error { 145 output, err := os.Create(finalFile) 146 if err != nil { 147 return err 148 } 149 defer output.Close() 150 writer := bufio.NewWriter(output) 151 lineNumber := 0 152 for _, file := range files { 153 // 不存在某一个file就跳过 154 if _, err = os.Stat(file); os.IsNotExist(err) { 155 continue 156 } 157 // 打开当前字幕文件 158 f, err := os.Open(file) 159 if err != nil { 160 return err 161 } 162 defer f.Close() 163 // 处理当前字幕文件 164 scanner := bufio.NewScanner(f) 165 for scanner.Scan() { 166 line := scanner.Text() 167 168 if strings.Contains(line, "```") { 169 continue 170 } 171 172 if IsNumber(line) { 173 lineNumber++ 174 line = strconv.Itoa(lineNumber) 175 } 176 177 writer.WriteString(line + "\n") 178 } 179 } 180 writer.Flush() 181 182 return nil 183 } 184 185 // 给定文件和替换map,将文件中所有的key替换成value 186 func ReplaceFileContent(srcFile, dstFile string, replacements map[string]string) error { 187 file, err := os.Open(srcFile) 188 if err != nil { 189 return err 190 } 191 defer file.Close() 192 193 outFile, err := os.Create(dstFile) 194 if err != nil { 195 return err 196 } 197 defer outFile.Close() 198 199 scanner := bufio.NewScanner(file) 200 writer := bufio.NewWriter(outFile) // 提高性能 201 defer writer.Flush() 202 203 for scanner.Scan() { 204 line := scanner.Text() 205 for before, after := range replacements { 206 line = strings.ReplaceAll(line, before, after) 207 } 208 _, _ = writer.WriteString(line + "\n") 209 } 210 211 if err = scanner.Err(); err != nil { 212 return err 213 } 214 215 return nil 216 } 217 218 // 获得文件名后加上后缀的新文件名,不改变扩展名,例如:/home/ubuntu/abc.srt变成/home/ubuntu/abc_tmp.srt 219 func AddSuffixToFileName(filePath, suffix string) string { 220 dir := filepath.Dir(filePath) 221 ext := filepath.Ext(filePath) 222 name := strings.TrimSuffix(filepath.Base(filePath), ext) 223 newName := fmt.Sprintf("%s%s%s", name, suffix, ext) 224 return filepath.Join(dir, newName) 225 } 226 227 // 去除字符串中的标点符号等字符,确保字符中的内容都是whisper模型可以识别出来的,便于时间戳对齐 228 func GetRecognizableString(s string) string { 229 var result []rune 230 for _, v := range s { 231 // 英文字母和数字 232 if unicode.Is(unicode.Latin, v) || unicode.Is(unicode.Number, v) { 233 result = append(result, v) 234 } 235 // 中文 236 if unicode.Is(unicode.Han, v) { 237 result = append(result, v) 238 } 239 // 韩文 240 if unicode.Is(unicode.Hangul, v) { 241 result = append(result, v) 242 } 243 // 日文平假片假 244 if unicode.Is(unicode.Hiragana, v) || unicode.Is(unicode.Katakana, v) { 245 result = append(result, v) 246 } 247 } 248 return string(result) 249 } 250 251 func GetAudioDuration(inputFile string) (float64, error) { 252 // 使用 ffprobe 获取精确时长 253 cmd := exec.Command(storage.FfprobePath, "-i", inputFile, "-show_entries", "format=duration", "-v", "quiet", "-of", "csv=p=0") 254 cmdOutput, err := cmd.Output() 255 if err != nil { 256 return 0, fmt.Errorf("GetAudioDuration failed to get audio duration: %w", err) 257 } 258 259 // 解析时长 260 duration, err := strconv.ParseFloat(strings.TrimSpace(string(cmdOutput)), 64) 261 if err != nil { 262 return 0, fmt.Errorf("GetAudioDuration failed to parse audio duration: %w", err) 263 } 264 265 return duration, nil 266 } 267 268 // todo 后续再补充 269 func IsAsianLanguage(code types.StandardLanguageCode) bool { 270 return code == types.LanguageNameSimplifiedChinese || code == types.LanguageNameTraditionalChinese || code == types.LanguageNameJapanese || code == types.LanguageNameKorean || code == types.LanguageNameThai 271 } 272 273 func BeautifyAsianLanguageSentence(input string) string { 274 if len(input) == 0 { 275 return input 276 } 277 278 // 不处理的 279 pairPunctuations := map[rune]rune{ 280 '「': '」', '『': '』', '“': '”', '‘': '’', 281 '《': '》', '<': '>', '【': '】', '〔': '〕', 282 '(': ')', '[': ']', '{': '}', 283 } 284 285 // 需要处理的单标点 286 singlePunctuations := ",.;:!?~,、。!?;:…" 287 288 // 先处理字符串末尾的标点 289 runes := []rune(input) 290 i := len(runes) - 1 291 for i >= 0 { 292 r := runes[i] 293 // 如果是空格,继续检查前一个字符 294 if unicode.IsSpace(r) { 295 i-- 296 continue 297 } 298 // 如果是单标点,去除 299 if strings.ContainsRune(singlePunctuations, r) { 300 runes = runes[:i] 301 i-- 302 } else { 303 // 遇到非标点或成对标点,停止 304 break 305 } 306 } 307 308 // 中间的单标点替换为空格 309 var inPair bool 310 var expectedClose rune 311 var result []rune 312 313 for i := 0; i < len(runes); i++ { 314 r := runes[i] 315 316 // 检查是否在成对标点内 317 if inPair { 318 if r == expectedClose { 319 inPair = false 320 } 321 result = append(result, r) 322 continue 323 } 324 325 // 检查是否是成对标点的开始 326 if close, isPair := pairPunctuations[r]; isPair { 327 inPair = true 328 expectedClose = close 329 result = append(result, r) 330 continue 331 } 332 333 // 检查是否是数字中的小数点 334 if r == '.' && i > 0 && i < len(runes)-1 { 335 prev := runes[i-1] 336 next := runes[i+1] 337 if unicode.IsDigit(prev) && unicode.IsDigit(next) { 338 result = append(result, r) 339 continue 340 } 341 } 342 343 // 处理单标点 344 if strings.ContainsRune(singlePunctuations, r) { 345 // 替换为空格,但避免连续空格 346 if len(result) > 0 && !unicode.IsSpace(result[len(result)-1]) { 347 result = append(result, ' ') 348 } 349 } else { 350 result = append(result, r) 351 } 352 } 353 354 return strings.TrimSpace(string(result)) 355 } 356 357 // SplitTextSentences 将文本按常见的半全角分隔符号切分成句子,会考虑一些特殊的不用切分的情况 358 // maxChars: 最小字符数,完整句子小于此字符数时不切割,否则连逗号也要切割 359 // 使用示例: 360 // 361 // SplitTextSentences("你好,世界!", 5) // 返回: ["你好,世界!"] (不切割,因为总字符数<5) 362 // SplitTextSentences("这是一个很长的句子,包含很多内容。", 10) // 返回: ["这是一个很长的句子", "包含很多内容。"] (切割逗号) 363 func SplitTextSentences(text string, maxChars int) []string { 364 if strings.TrimSpace(text) == "" { 365 return []string{} 366 } 367 368 // 第一步:保护特殊模式(数字、时间、缩写等) 369 text = protectSpecialNumbers(text) 370 371 // 第二步:智能切割 - 首先按完整句子分割 372 completeSentences := splitByCompleteSentences(text) 373 374 var result []string 375 for _, sentence := range completeSentences { 376 sentence = strings.TrimSpace(sentence) 377 if sentence == "" { 378 continue 379 } 380 381 // 统计有效字符数(排除标点和空格) 382 effectiveChars := CountEffectiveChars(sentence) 383 384 // 如果完整句子小于最小字符数,不切割 385 if effectiveChars < maxChars { 386 cleaned := restoreProtectedPatterns(sentence) 387 result = append(result, strings.TrimSpace(cleaned)) 388 } else { 389 // 完整句子过长,需要进一步按逗号等标点切割 390 subSentences := splitByAllPunctuation(sentence) 391 merged := mergeShortSentences(subSentences, 20, maxChars) 392 393 for _, subSentence := range merged { 394 cleaned := restoreProtectedPatterns(subSentence) 395 cleaned = strings.TrimSpace(cleaned) 396 if cleaned != "" { 397 result = append(result, cleaned) 398 } 399 } 400 } 401 } 402 403 return result 404 } 405 406 // protectedPatterns 存储被保护的模式 407 var protectedPatterns map[string]string 408 409 // protectSpecialNumbers 保护数字、时间、缩写等不被误切 410 func protectSpecialNumbers(text string) string { 411 protectedPatterns = make(map[string]string) 412 413 // 使用更直接的方法来保护列表编号模式 414 // 先处理特定的模式,如 "1.value", "2.be", "3.give" 等 415 listNumberPattern := regexp.MustCompile(`\b\d+\.[a-zA-Z]`) 416 text = listNumberPattern.ReplaceAllStringFunc(text, func(match string) string { 417 placeholder := fmt.Sprintf("\uE000%d\uE000", len(protectedPatterns)) 418 protectedPatterns[placeholder] = match 419 return placeholder 420 }) 421 422 patterns := []struct { 423 regex *regexp.Regexp 424 name string 425 }{ 426 // 保护域名和网址(如 .com, .org, .net 等) 427 {regexp.MustCompile(`\b[a-zA-Z0-9-]+\.(?:com|org|net|edu|gov|mil|int|co|io|ai|me|tv|fm|am|pm|uk|cn|jp|de|fr|it|es|ru|in|au|ca|br|mx|ar|cl|pe|ve|ec|py|uy|bo|gf|sr|gy|fk|gs|sh|ac|ad|ae|af|ag|al|am|an|ao|aq|as|at|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cc|cd|cf|cg|ch|ci|ck|cm|co|cr|cs|cu|cv|cx|cy|cz|dj|dk|dm|do|dz|eg|eh|er|et|eu|fi|fj|fk|fo|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|iq|ir|is|je|jm|jo|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|qa|re|ro|rs|rw|sa|sb|sc|sd|se|sg|si|sj|sk|sl|sm|sn|so|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tz|ua|ug|um|us|uy|uz|va|vc|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)\b`), "domain"}, 428 // 保护 a.m., p.m., A.M., P.M. 这类缩写 429 {regexp.MustCompile(`(?i)\b[ap]\.m\.`), "ampm"}, 430 // 时间格式 431 {regexp.MustCompile(`\b\d{1,2}[:\.]\d{2}\s*(?:[ap]\.?m\.?|AM|PM)?\b`), "time"}, 432 // 小数(包括多位小数) 433 {regexp.MustCompile(`\b\d+\.\d+\b`), "decimal"}, 434 // 千位分隔符 435 {regexp.MustCompile(`\b\d{1,3}(?:,\d{3})+(?:\.\d+)?\b`), "thousands"}, 436 // 版本号(如 1.0, 2.5.1 等) 437 {regexp.MustCompile(`\b\d+(?:\.\d+)+\b`), "version"}, 438 // 英文缩写 439 {regexp.MustCompile(`\b(?:[A-Z][a-z]*\.){2,}|(?:[A-Z]\.){2,}[A-Z]?\b`), "abbrev"}, 440 // Mr., Mrs., Dr. 等称谓 441 {regexp.MustCompile(`\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr)\.`), "title"}, 442 // 列表编号(如 1., 2., 3. 等)- 数字+点+空格 443 {regexp.MustCompile(`\b\d+\.\s`), "list_number_with_space"}, 444 // 字母编号(如 a., b., c. 等) 445 {regexp.MustCompile(`\b[a-zA-Z]\.\s`), "letter_number_with_space"}, 446 } 447 448 for _, pattern := range patterns { 449 text = pattern.regex.ReplaceAllStringFunc(text, func(match string) string { 450 placeholder := fmt.Sprintf("\uE000%d\uE000", len(protectedPatterns)) 451 protectedPatterns[placeholder] = match 452 return placeholder 453 }) 454 } 455 456 return text 457 } 458 459 // splitByCompleteSentences 按完整句子标点分割(句号、感叹号、问号等) 460 func splitByCompleteSentences(text string) []string { 461 // 只按句末标点分割,不包含逗号 462 completeSentenceMarkers := []string{ 463 ".", "!", "?", "。", "!", "?", ";", "\n", "\r\n", 464 } 465 466 // 创建正则表达式模式 467 var patterns []string 468 for _, marker := range completeSentenceMarkers { 469 patterns = append(patterns, regexp.QuoteMeta(marker)) 470 } 471 472 // 匹配连续的句末标点符号 473 regexPattern := fmt.Sprintf(`([%s]+)`, strings.Join(patterns, "")) 474 regex := regexp.MustCompile(regexPattern) 475 476 // 在标点符号后添加分隔符 477 text = regex.ReplaceAllString(text, "${1}\uE001") 478 479 // 按分隔符分割 480 parts := strings.Split(text, "\uE001") 481 482 var segments []string 483 for _, part := range parts { 484 trimmed := strings.TrimSpace(part) 485 if trimmed != "" { 486 segments = append(segments, trimmed) 487 } 488 } 489 490 return segments 491 } 492 493 // countEffectiveChars 统计有效字符数(排除标点和空格) 494 func CountEffectiveChars(text string) int { 495 effectiveText := regexp.MustCompile(`[^\p{L}\p{N}]`).ReplaceAllString(text, "") 496 return len([]rune(effectiveText)) 497 } 498 499 // splitByAllPunctuation 按所有标点符号分割文本 500 func splitByAllPunctuation(text string) []string { 501 // 注意:这里的text已经在SplitTextSentences中被保护过了,不需要再次保护 502 503 // 定义分割标点符号(包括中英文标点) 504 punctuationMarkers := []string{ 505 // 句末标点 506 ".", "!", "?", ";", "。", "!", "?", ";", 507 // 句内标点(也要分割) 508 ",", ",", ";", 509 // 换行符 510 "\n", "\r\n", 511 } 512 513 // 创建正则表达式模式 514 var patterns []string 515 for _, marker := range punctuationMarkers { 516 patterns = append(patterns, regexp.QuoteMeta(marker)) 517 } 518 519 // 匹配连续的标点符号 520 regexPattern := fmt.Sprintf(`([%s]+)`, strings.Join(patterns, "")) 521 regex := regexp.MustCompile(regexPattern) 522 523 // 在标点符号后添加分隔符 524 text = regex.ReplaceAllString(text, "${1}\uE001") 525 526 // 按分隔符分割 527 parts := strings.Split(text, "\uE001") 528 529 var segments []string 530 for _, part := range parts { 531 trimmed := strings.TrimSpace(part) 532 if trimmed != "" { 533 segments = append(segments, trimmed) 534 } 535 } 536 537 return segments 538 } 539 540 // mergeShortSentences 合并过短的句子 541 // maxChars: 最小字符数,句子小于此值时考虑合并 542 // maxChars: 最大字符数,合并后的句子不能超过此值 543 func mergeShortSentences(segments []string, minChars, maxChars int) []string { 544 if len(segments) == 0 { 545 return segments 546 } 547 548 var result []string 549 var current strings.Builder 550 551 for i, segment := range segments { 552 segment = strings.TrimSpace(segment) 553 if segment == "" { 554 continue 555 } 556 557 // 添加到当前句子 558 if current.Len() > 0 { 559 current.WriteString(" ") 560 } 561 current.WriteString(segment) 562 563 currentText := current.String() 564 currentEffectiveChars := CountEffectiveChars(currentText) 565 566 // 检查是否应该合并下一个片段 567 shouldMerge := false 568 if i < len(segments)-1 { // 还有下一个片段 569 nextSegment := strings.TrimSpace(segments[i+1]) 570 if nextSegment != "" { 571 // 计算合并后的长度 572 potentialMerged := currentText + " " + nextSegment 573 mergedEffectiveChars := CountEffectiveChars(potentialMerged) 574 575 // 只有当前句子小于minChars,并且合并后不超过maxChars才合并 576 shouldMerge = currentEffectiveChars < minChars && mergedEffectiveChars <= maxChars 577 } 578 } 579 580 if !shouldMerge { 581 // 不合并,输出当前句子并重置 582 result = append(result, strings.TrimSpace(currentText)) 583 current.Reset() 584 } 585 // 如果shouldMerge为true,继续循环到下一个片段进行合并 586 } 587 588 // 处理最后的片段 589 if current.Len() > 0 { 590 result = append(result, strings.TrimSpace(current.String())) 591 } 592 593 return result 594 } 595 596 // isTooShort 判断句子是否过短需要合并 597 func isTooShort(text string, maxChars int) bool { 598 text = strings.TrimSpace(text) 599 600 // 计算有效字符数(排除标点和空格) 601 effectiveChars := CountEffectiveChars(text) 602 603 // 如果有效字符少于最小字符数,认为过短 604 if effectiveChars < maxChars { 605 return true 606 } 607 608 // 如果只有一个单词,也认为过短(除非已经达到最小字符数) 609 words := strings.Fields(text) 610 return len(words) <= 1 && effectiveChars < maxChars 611 } 612 613 // restoreProtectedPatterns 恢复被保护的模式 614 func restoreProtectedPatterns(text string) string { 615 for placeholder, original := range protectedPatterns { 616 text = strings.ReplaceAll(text, placeholder, original) 617 } 618 return text 619 } 620 621 // 将start和end转换为指定格式 622 func ConvertTimes(start, end float32) string { 623 startTime := FormatTime(start) 624 endTime := FormatTime(end) 625 return fmt.Sprintf("%s --> %s", startTime, endTime) 626 }