/ pkg / util / subtitle.go
subtitle.go
  1  package util
  2  
  3  import (
  4  	"bufio"
  5  	"fmt"
  6  	"krillin-ai/internal/storage"
  7  	"krillin-ai/internal/types"
  8  	"os"
  9  	"os/exec"
 10  	"path/filepath"
 11  	"regexp"
 12  	"strconv"
 13  	"strings"
 14  	"unicode"
 15  )
 16  
 17  // 处理每一个字幕块
 18  func ProcessBlock(block []string, targetLanguageFile, targetLanguageTextFile, originLanguageFile, originLanguageTextFile *os.File, isTargetOnTop bool) {
 19  	var targetLines, originLines []string
 20  	// 匹配时间戳的正则表达式
 21  	timePattern := regexp.MustCompile(`\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}`)
 22  	for _, line := range block {
 23  		if timePattern.MatchString(line) || IsNumber(line) {
 24  			// 时间戳和编号行保留在两个文件中
 25  			targetLines = append(targetLines, line)
 26  			originLines = append(originLines, line)
 27  			continue
 28  		}
 29  		if len(targetLines) == 2 && len(originLines) == 2 { // 刚写完编号和时间戳,到了上方的文字行
 30  			if isTargetOnTop {
 31  				targetLines = append(targetLines, line)
 32  				targetLanguageTextFile.WriteString(line + " ") // 文稿文件
 33  			} else {
 34  				originLines = append(originLines, line)
 35  				originLanguageTextFile.WriteString(line + " ")
 36  			}
 37  			continue
 38  		}
 39  		// 到了下方的文字行
 40  		if isTargetOnTop {
 41  			originLines = append(originLines, line)
 42  			originLanguageTextFile.WriteString(line + " ")
 43  		} else {
 44  			targetLines = append(targetLines, line)
 45  			targetLanguageTextFile.WriteString(line + " ")
 46  		}
 47  	}
 48  
 49  	if len(targetLines) > 2 {
 50  		// 写入目标语言文件
 51  		for _, line := range targetLines {
 52  			targetLanguageFile.WriteString(line + "\n")
 53  		}
 54  		targetLanguageFile.WriteString("\n")
 55  	}
 56  
 57  	if len(originLines) > 2 {
 58  		// 写入源语言文件
 59  		for _, line := range originLines {
 60  			originLanguageFile.WriteString(line + "\n")
 61  		}
 62  		originLanguageFile.WriteString("\n")
 63  	}
 64  }
 65  
 66  // IsSubtitleText 是否是字幕文件中的字幕文字行
 67  func IsSubtitleText(line string) bool {
 68  	if line == "" {
 69  		return false
 70  	}
 71  	if IsNumber(line) {
 72  		return false
 73  	}
 74  	timelinePattern := regexp.MustCompile(`\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}`)
 75  	return !timelinePattern.MatchString(line)
 76  }
 77  
// Format carries a single duration value decoded from the JSON key "duration".
// NOTE(review): presumably mirrors the "format" section of ffprobe's JSON
// output — confirm against the code that unmarshals it.
type Format struct {
	Duration string `json:"duration"` // duration kept as the raw string from the JSON payload
}
 81  
// ProbeData is the top-level JSON document wrapping a Format under the key
// "format".
// NOTE(review): looks like the envelope of an ffprobe JSON report — confirm.
type ProbeData struct {
	Format Format `json:"format"` // nested object holding the duration
}
 85  
// SrtBlock describes one subtitle cue with its text in two languages.
type SrtBlock struct {
	Index                  int    // cue number (presumably the SRT sequence index — verify against callers)
	Timestamp              string // timing line of the cue, kept as text
	TargetLanguageSentence string // cue text in the target language
	OriginLanguageSentence string // cue text in the origin language
}
 92  
 93  func TrimString(s string) string {
 94  	s = strings.Replace(s, "[中文翻译]", "", -1)
 95  	s = strings.Replace(s, "[英文句子]", "", -1)
 96  	// 去除开头的空格和 '['
 97  	s = strings.TrimLeft(s, " [")
 98  
 99  	// 去除结尾的空格和 ']'
100  	s = strings.TrimRight(s, " ]")
101  
102  	//替换中文单引号
103  	s = strings.ReplaceAll(s, "’", "'")
104  
105  	return s
106  }
107  
// SplitSentence breaks sentence into words. A word is a maximal run of
// Unicode letters, Unicode numbers and apostrophes; every other rune
// (punctuation, symbols, any kind of whitespace) acts as a separator.
//
// The result is never nil; it is empty when sentence contains no word.
func SplitSentence(sentence string) []string {
	// One pass over the runes yields the same words as first replacing all
	// non-word runs with a space and then splitting on whitespace, without
	// compiling a regular expression on every call.
	return strings.FieldsFunc(sentence, func(r rune) bool {
		return r != '\'' && !unicode.IsLetter(r) && !unicode.IsNumber(r)
	})
}
118  
// MergeFile creates (or truncates) finalFile and appends the lines of each
// file in files to it, in order. Every line is written with a single "\n"
// terminator, so the last line of each input always ends with a newline.
//
// It returns the first error hit while creating the output, opening or
// reading an input (including a line too long for bufio.Scanner), writing,
// flushing, or closing the output. On error the output may be incomplete.
func MergeFile(finalFile string, files ...string) (err error) {
	final, err := os.Create(finalFile)
	if err != nil {
		return err
	}
	// Close the output on every path; report the close error only when
	// nothing failed earlier.
	defer func() {
		if closeErr := final.Close(); err == nil {
			err = closeErr
		}
	}()

	writer := bufio.NewWriter(final)
	for _, file := range files {
		if err = appendFileLines(writer, file); err != nil {
			return err
		}
	}
	return writer.Flush()
}

// appendFileLines copies file line by line into writer. Having it as a
// separate function lets the input be closed as soon as it has been read
// rather than when the whole merge returns.
func appendFileLines(writer *bufio.Writer, file string) error {
	f, err := os.Open(file)
	if err != nil {
		return err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		if _, err := writer.WriteString(scanner.Text() + "\n"); err != nil {
			return err
		}
	}
	// Without this check an over-long line would silently truncate the output.
	return scanner.Err()
}
143  
144  func MergeSrtFiles(finalFile string, files ...string) error {
145  	output, err := os.Create(finalFile)
146  	if err != nil {
147  		return err
148  	}
149  	defer output.Close()
150  	writer := bufio.NewWriter(output)
151  	lineNumber := 0
152  	for _, file := range files {
153  		// 不存在某一个file就跳过
154  		if _, err = os.Stat(file); os.IsNotExist(err) {
155  			continue
156  		}
157  		// 打开当前字幕文件
158  		f, err := os.Open(file)
159  		if err != nil {
160  			return err
161  		}
162  		defer f.Close()
163  		// 处理当前字幕文件
164  		scanner := bufio.NewScanner(f)
165  		for scanner.Scan() {
166  			line := scanner.Text()
167  
168  			if strings.Contains(line, "```") {
169  				continue
170  			}
171  
172  			if IsNumber(line) {
173  				lineNumber++
174  				line = strconv.Itoa(lineNumber)
175  			}
176  
177  			writer.WriteString(line + "\n")
178  		}
179  	}
180  	writer.Flush()
181  
182  	return nil
183  }
184  
// ReplaceFileContent copies srcFile to dstFile line by line, replacing every
// occurrence of each key of replacements with its value. Each output line
// ends with "\n".
//
// Replacements are applied one key after another in map iteration order,
// which Go does not define. When one key overlaps another key or another
// value, the result can therefore vary between runs; callers should pass
// keys that do not interact.
//
// The first error from opening, reading, writing, flushing or closing is
// returned. Buffered output is flushed even when reading fails.
func ReplaceFileContent(srcFile, dstFile string, replacements map[string]string) (err error) {
	src, err := os.Open(srcFile)
	if err != nil {
		return err
	}
	defer src.Close()

	dst, err := os.Create(dstFile)
	if err != nil {
		return err
	}
	writer := bufio.NewWriter(dst) // batch the many small line writes

	// Flush, then close; keep the earliest error so a failed write is never
	// reported as success.
	defer func() {
		if flushErr := writer.Flush(); err == nil {
			err = flushErr
		}
		if closeErr := dst.Close(); err == nil {
			err = closeErr
		}
	}()

	scanner := bufio.NewScanner(src)
	for scanner.Scan() {
		line := scanner.Text()
		for before, after := range replacements {
			line = strings.ReplaceAll(line, before, after)
		}
		if _, err = writer.WriteString(line + "\n"); err != nil {
			return err
		}
	}
	return scanner.Err()
}
217  
// AddSuffixToFileName inserts suffix between the base name and the extension
// of filePath, leaving the directory and the extension untouched.
// For example "/home/ubuntu/abc.srt" with suffix "_tmp" becomes
// "/home/ubuntu/abc_tmp.srt".
func AddSuffixToFileName(filePath, suffix string) string {
	extension := filepath.Ext(filePath)
	stem := strings.TrimSuffix(filepath.Base(filePath), extension)
	return filepath.Join(filepath.Dir(filePath), stem+suffix+extension)
}
226  
// GetRecognizableString returns s with every rune removed that is not in one
// of these Unicode tables: Latin, Number, Han, Hangul, Hiragana, Katakana.
// Punctuation, symbols and whitespace are therefore dropped. The intent is to
// keep only content a Whisper model can emit, which eases timestamp alignment.
//
// Each kept rune is emitted exactly once, even when it belongs to several of
// the tables (for example U+3007 "〇", which is both a Number and a Han rune).
func GetRecognizableString(s string) string {
	var kept strings.Builder
	for _, r := range s {
		if unicode.In(r,
			unicode.Latin, unicode.Number, // Latin-script letters and all numeric runes
			unicode.Han,                        // Chinese characters
			unicode.Hangul,                     // Korean
			unicode.Hiragana, unicode.Katakana, // Japanese kana
		) {
			kept.WriteRune(r)
		}
	}
	return kept.String()
}
250  
251  func GetAudioDuration(inputFile string) (float64, error) {
252  	// 使用 ffprobe 获取精确时长
253  	cmd := exec.Command(storage.FfprobePath, "-i", inputFile, "-show_entries", "format=duration", "-v", "quiet", "-of", "csv=p=0")
254  	cmdOutput, err := cmd.Output()
255  	if err != nil {
256  		return 0, fmt.Errorf("GetAudioDuration failed to get audio duration: %w", err)
257  	}
258  
259  	// 解析时长
260  	duration, err := strconv.ParseFloat(strings.TrimSpace(string(cmdOutput)), 64)
261  	if err != nil {
262  		return 0, fmt.Errorf("GetAudioDuration failed to parse audio duration: %w", err)
263  	}
264  
265  	return duration, nil
266  }
267  
268  // todo 后续再补充
269  func IsAsianLanguage(code types.StandardLanguageCode) bool {
270  	return code == types.LanguageNameSimplifiedChinese || code == types.LanguageNameTraditionalChinese || code == types.LanguageNameJapanese || code == types.LanguageNameKorean || code == types.LanguageNameThai
271  }
272  
// BeautifyAsianLanguageSentence tidies the punctuation of a subtitle sentence.
//
//   - Trailing single punctuation marks (and the whitespace around them) are
//     removed.
//   - Single punctuation marks inside the sentence become one space. Whitespace
//     that directly follows such a mark is dropped, so "a, b" yields "a b"
//     rather than two consecutive spaces.
//   - Text enclosed in paired punctuation (quotes, brackets, ...) is copied
//     verbatim, including any punctuation inside the pair.
//   - A '.' between two digits is kept as a decimal point.
//
// The result has leading and trailing whitespace trimmed.
func BeautifyAsianLanguageSentence(input string) string {
	if len(input) == 0 {
		return input
	}

	// Opening mark -> closing mark; content between them is left untouched.
	pairPunctuations := map[rune]rune{
		'「': '」', '『': '』', '“': '”', '‘': '’',
		'《': '》', '<': '>', '【': '】', '〔': '〕',
		'(': ')', '[': ']', '{': '}',
	}

	// Single marks that are stripped at the end and replaced in the middle.
	singlePunctuations := ",.;:!?~,、。!?;:…"

	// Step 1: walk backwards and cut off trailing single punctuation.
	// Whitespace is skipped so that marks separated by spaces are removed too.
	chars := []rune(input)
	for end := len(chars) - 1; end >= 0; end-- {
		c := chars[end]
		if unicode.IsSpace(c) {
			continue
		}
		if !strings.ContainsRune(singlePunctuations, c) {
			// Regular character or paired mark: the sentence ends here.
			break
		}
		chars = chars[:end]
	}

	// Step 2: rebuild the sentence, replacing inner single punctuation.
	var (
		out        []rune
		closing    rune // closing mark being waited for; 0 when outside a pair
		afterPunct bool // the previous rune was a replaced punctuation mark
	)
	for idx, c := range chars {
		closer, isOpener := pairPunctuations[c]
		switch {
		case closing != 0:
			// Inside a pair: copy verbatim until the closing mark shows up.
			if c == closing {
				closing = 0
			}
			out = append(out, c)
			afterPunct = false

		case isOpener:
			closing = closer
			out = append(out, c)
			afterPunct = false

		case c == '.' && idx > 0 && idx < len(chars)-1 &&
			unicode.IsDigit(chars[idx-1]) && unicode.IsDigit(chars[idx+1]):
			// Decimal point inside a number.
			out = append(out, c)
			afterPunct = false

		case strings.ContainsRune(singlePunctuations, c):
			// Replace with a space unless one is already there (or nothing
			// has been written yet).
			if n := len(out); n > 0 && !unicode.IsSpace(out[n-1]) {
				out = append(out, ' ')
			}
			afterPunct = true

		default:
			// The separator for the replaced mark is already in place (or
			// the output is still empty), so whitespace right after it would
			// only produce consecutive spaces.
			if afterPunct && unicode.IsSpace(c) {
				continue
			}
			out = append(out, c)
			afterPunct = false
		}
	}

	return strings.TrimSpace(string(out))
}
356  
357  // SplitTextSentences 将文本按常见的半全角分隔符号切分成句子,会考虑一些特殊的不用切分的情况
358  // maxChars: 最小字符数,完整句子小于此字符数时不切割,否则连逗号也要切割
359  // 使用示例:
360  //
361  //	SplitTextSentences("你好,世界!", 5)  // 返回: ["你好,世界!"] (不切割,因为总字符数<5)
362  //	SplitTextSentences("这是一个很长的句子,包含很多内容。", 10) // 返回: ["这是一个很长的句子", "包含很多内容。"] (切割逗号)
363  func SplitTextSentences(text string, maxChars int) []string {
364  	if strings.TrimSpace(text) == "" {
365  		return []string{}
366  	}
367  
368  	// 第一步:保护特殊模式(数字、时间、缩写等)
369  	text = protectSpecialNumbers(text)
370  
371  	// 第二步:智能切割 - 首先按完整句子分割
372  	completeSentences := splitByCompleteSentences(text)
373  
374  	var result []string
375  	for _, sentence := range completeSentences {
376  		sentence = strings.TrimSpace(sentence)
377  		if sentence == "" {
378  			continue
379  		}
380  
381  		// 统计有效字符数(排除标点和空格)
382  		effectiveChars := CountEffectiveChars(sentence)
383  
384  		// 如果完整句子小于最小字符数,不切割
385  		if effectiveChars < maxChars {
386  			cleaned := restoreProtectedPatterns(sentence)
387  			result = append(result, strings.TrimSpace(cleaned))
388  		} else {
389  			// 完整句子过长,需要进一步按逗号等标点切割
390  			subSentences := splitByAllPunctuation(sentence)
391  			merged := mergeShortSentences(subSentences, 20, maxChars)
392  
393  			for _, subSentence := range merged {
394  				cleaned := restoreProtectedPatterns(subSentence)
395  				cleaned = strings.TrimSpace(cleaned)
396  				if cleaned != "" {
397  					result = append(result, cleaned)
398  				}
399  			}
400  		}
401  	}
402  
403  	return result
404  }
405  
// protectedPatterns maps every placeholder produced by the latest
// protectSpecialNumbers call to the text it replaced. It is package-level
// state that is reset on each call, so the protect/restore pair is not safe
// for concurrent use.
var protectedPatterns map[string]string

// protectedListItemPattern matches a list number glued to the following word,
// such as "1.value" or "2.be" (only the number, the dot and the first letter).
var protectedListItemPattern = regexp.MustCompile(`\b\d+\.[a-zA-Z]`)

// protectedTokenPatterns lists, in application order, the tokens whose dots
// and commas must not be treated as sentence punctuation. All expressions are
// compiled once at package scope.
//
// Order matters because an earlier match hides its text from later patterns:
// thousands-separated numbers and multi-part version numbers are matched
// before the time and decimal patterns, otherwise "2.5.1" would be protected
// only as "2.5", leaving ".1" behind, and "1,234.56" only as "234.56".
var protectedTokenPatterns = []*regexp.Regexp{
	// Domain names and URLs (.com, .org, .net, ...).
	regexp.MustCompile(`\b[a-zA-Z0-9-]+\.(?:com|org|net|edu|gov|mil|int|co|io|ai|me|tv|fm|am|pm|uk|cn|jp|de|fr|it|es|ru|in|au|ca|br|mx|ar|cl|pe|ve|ec|py|uy|bo|gf|sr|gy|fk|gs|sh|ac|ad|ae|af|ag|al|am|an|ao|aq|as|at|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cc|cd|cf|cg|ch|ci|ck|cm|co|cr|cs|cu|cv|cx|cy|cz|dj|dk|dm|do|dz|eg|eh|er|et|eu|fi|fj|fk|fo|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|iq|ir|is|je|jm|jo|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|qa|re|ro|rs|rw|sa|sb|sc|sd|se|sg|si|sj|sk|sl|sm|sn|so|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tz|ua|ug|um|us|uy|uz|va|vc|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)\b`),
	// a.m., p.m., A.M., P.M.
	regexp.MustCompile(`(?i)\b[ap]\.m\.`),
	// Numbers with thousands separators, optionally with a fraction.
	regexp.MustCompile(`\b\d{1,3}(?:,\d{3})+(?:\.\d+)?\b`),
	// Version numbers (1.0, 2.5.1, ...); this also covers plain decimals.
	regexp.MustCompile(`\b\d+(?:\.\d+)+\b`),
	// Clock times, optionally followed by an am/pm marker.
	regexp.MustCompile(`\b\d{1,2}[:\.]\d{2}\s*(?:[ap]\.?m\.?|AM|PM)?\b`),
	// Decimals (kept for completeness; normally consumed by the version rule).
	regexp.MustCompile(`\b\d+\.\d+\b`),
	// Dotted English abbreviations.
	regexp.MustCompile(`\b(?:[A-Z][a-z]*\.){2,}|(?:[A-Z]\.){2,}[A-Z]?\b`),
	// Titles: Mr., Mrs., Dr., ...
	regexp.MustCompile(`\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr)\.`),
	// List numbers followed by whitespace: "1. ", "2. ", ...
	regexp.MustCompile(`\b\d+\.\s`),
	// List letters followed by whitespace: "a. ", "b. ", ...
	regexp.MustCompile(`\b[a-zA-Z]\.\s`),
}

// protectSpecialNumbers replaces numbers, times, abbreviations, domains and
// similar tokens in text with placeholders of the form "\uE000<n>\uE000",
// so that their dots and commas are not mistaken for sentence punctuation.
// The placeholder-to-original mapping is stored in protectedPatterns, which
// this function resets on entry.
func protectSpecialNumbers(text string) string {
	protectedPatterns = make(map[string]string)

	// shield registers one match and returns its placeholder; the running map
	// size provides a unique index.
	shield := func(match string) string {
		placeholder := fmt.Sprintf("\uE000%d\uE000", len(protectedPatterns))
		protectedPatterns[placeholder] = match
		return placeholder
	}

	// List numbers glued to a word go first so no later rule can claim their dot.
	text = protectedListItemPattern.ReplaceAllStringFunc(text, shield)
	for _, pattern := range protectedTokenPatterns {
		text = pattern.ReplaceAllStringFunc(text, shield)
	}
	return text
}
458  
// splitByCompleteSentences cuts text after every run of sentence-ending
// marks (full stop, exclamation mark, question mark, semicolon, line break,
// ...). Commas are not cut points. The marks stay attached to the end of the
// preceding segment; segments are trimmed and blank ones are dropped.
func splitByCompleteSentences(text string) []string {
	const boundary = "\uE001" // private-use rune used as a temporary cut marker

	// Sentence terminators only; commas are deliberately absent.
	terminators := []string{
		".", "!", "?", "。", "!", "?", ";", "\n", "\r\n",
	}

	// Build one character class out of all escaped terminators.
	var class strings.Builder
	for _, mark := range terminators {
		class.WriteString(regexp.QuoteMeta(mark))
	}
	terminatorRun := regexp.MustCompile("([" + class.String() + "]+)")

	// Put a cut marker after each run of terminators, then split on it.
	marked := terminatorRun.ReplaceAllString(text, "${1}"+boundary)

	var segments []string
	for _, piece := range strings.Split(marked, boundary) {
		if piece = strings.TrimSpace(piece); piece != "" {
			segments = append(segments, piece)
		}
	}
	return segments
}
492  
// CountEffectiveChars returns the number of runes in text that are Unicode
// letters or Unicode numbers. Punctuation, symbols and whitespace are not
// counted.
func CountEffectiveChars(text string) int {
	// A plain rune loop gives the same count as deleting every rune outside
	// \p{L} and \p{N} and measuring the rest, without compiling a regular
	// expression on every call.
	count := 0
	for _, r := range text {
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			count++
		}
	}
	return count
}
498  
// splitByAllPunctuation cuts text after every run of punctuation marks:
// sentence terminators as well as commas and semicolons, plus line breaks.
// The marks stay attached to the end of the preceding segment; segments are
// trimmed and blank ones are dropped.
//
// The text handed in by SplitTextSentences has already had its special
// tokens replaced by placeholders, so nothing is protected here.
func splitByAllPunctuation(text string) []string {
	const boundary = "\uE001" // private-use rune used as a temporary cut marker

	cutMarks := []string{
		// sentence-ending marks
		".", "!", "?", ";", "。", "!", "?", ";",
		// marks inside a sentence (also cut points)
		",", ",", ";",
		// line breaks
		"\n", "\r\n",
	}

	// Build one character class out of all escaped marks.
	var class strings.Builder
	for _, mark := range cutMarks {
		class.WriteString(regexp.QuoteMeta(mark))
	}
	punctuationRun := regexp.MustCompile("([" + class.String() + "]+)")

	// Put a cut marker after each run of punctuation, then split on it.
	marked := punctuationRun.ReplaceAllString(text, "${1}"+boundary)

	var segments []string
	for _, piece := range strings.Split(marked, boundary) {
		if piece = strings.TrimSpace(piece); piece != "" {
			segments = append(segments, piece)
		}
	}
	return segments
}
539  
540  // mergeShortSentences 合并过短的句子
541  // maxChars: 最小字符数,句子小于此值时考虑合并
542  // maxChars: 最大字符数,合并后的句子不能超过此值
543  func mergeShortSentences(segments []string, minChars, maxChars int) []string {
544  	if len(segments) == 0 {
545  		return segments
546  	}
547  
548  	var result []string
549  	var current strings.Builder
550  
551  	for i, segment := range segments {
552  		segment = strings.TrimSpace(segment)
553  		if segment == "" {
554  			continue
555  		}
556  
557  		// 添加到当前句子
558  		if current.Len() > 0 {
559  			current.WriteString(" ")
560  		}
561  		current.WriteString(segment)
562  
563  		currentText := current.String()
564  		currentEffectiveChars := CountEffectiveChars(currentText)
565  
566  		// 检查是否应该合并下一个片段
567  		shouldMerge := false
568  		if i < len(segments)-1 { // 还有下一个片段
569  			nextSegment := strings.TrimSpace(segments[i+1])
570  			if nextSegment != "" {
571  				// 计算合并后的长度
572  				potentialMerged := currentText + " " + nextSegment
573  				mergedEffectiveChars := CountEffectiveChars(potentialMerged)
574  
575  				// 只有当前句子小于minChars,并且合并后不超过maxChars才合并
576  				shouldMerge = currentEffectiveChars < minChars && mergedEffectiveChars <= maxChars
577  			}
578  		}
579  
580  		if !shouldMerge {
581  			// 不合并,输出当前句子并重置
582  			result = append(result, strings.TrimSpace(currentText))
583  			current.Reset()
584  		}
585  		// 如果shouldMerge为true,继续循环到下一个片段进行合并
586  	}
587  
588  	// 处理最后的片段
589  	if current.Len() > 0 {
590  		result = append(result, strings.TrimSpace(current.String()))
591  	}
592  
593  	return result
594  }
595  
596  // isTooShort 判断句子是否过短需要合并
597  func isTooShort(text string, maxChars int) bool {
598  	text = strings.TrimSpace(text)
599  
600  	// 计算有效字符数(排除标点和空格)
601  	effectiveChars := CountEffectiveChars(text)
602  
603  	// 如果有效字符少于最小字符数,认为过短
604  	if effectiveChars < maxChars {
605  		return true
606  	}
607  
608  	// 如果只有一个单词,也认为过短(除非已经达到最小字符数)
609  	words := strings.Fields(text)
610  	return len(words) <= 1 && effectiveChars < maxChars
611  }
612  
613  // restoreProtectedPatterns 恢复被保护的模式
614  func restoreProtectedPatterns(text string) string {
615  	for placeholder, original := range protectedPatterns {
616  		text = strings.ReplaceAll(text, placeholder, original)
617  	}
618  	return text
619  }
620  
621  // 将start和end转换为指定格式
622  func ConvertTimes(start, end float32) string {
623  	startTime := FormatTime(start)
624  	endTime := FormatTime(end)
625  	return fmt.Sprintf("%s --> %s", startTime, endTime)
626  }