utils.py
 1  # Copyright (c) 2024-2026 Tencent Zhuque Lab. All rights reserved.
 2  #
 3  # Licensed under the Apache License, Version 2.0 (the "License");
 4  # you may not use this file except in compliance with the License.
 5  # You may obtain a copy of the License at
 6  #
 7  #     http://www.apache.org/licenses/LICENSE-2.0
 8  #
 9  # Unless required by applicable law or agreed to in writing, software
10  # distributed under the License is distributed on an "AS IS" BASIS,
11  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  # See the License for the specific language governing permissions and
13  # limitations under the License.
14  #
15  # Requirement: Any integration or derivative work must explicitly attribute
16  # Tencent Zhuque Lab (https://github.com/Tencent/AI-Infra-Guard) in its
17  # documentation or user interface, as detailed in the NOTICE file.
18  
19  import re
20  from typing import Literal
21  
def judge_language(text: str, chinese_threshold: float = 0.5, english_threshold: float = 0.5) -> Literal["default", "chinese", "english"]:
    """Classify *text* as Chinese, English, or neither by character ratio.

    Every non-whitespace character (letters, digits, punctuation, symbols,
    ...) counts toward the denominator. A character counts as Chinese when it
    lies in U+4E00..U+9FFF, and as English when it is an ASCII letter
    (A-Z / a-z).

    Args:
        text: The text to classify. Empty, ``None``, or whitespace-only
            input yields ``'default'``.
        chinese_threshold: Minimum fraction of non-whitespace characters
            that must be Chinese for the result to be ``'chinese'``.
        english_threshold: Minimum fraction of non-whitespace characters
            that must be ASCII letters for the result to be ``'english'``.

    Returns:
        ``'chinese'`` if the Chinese ratio reaches ``chinese_threshold``;
        otherwise ``'english'`` if the English ratio reaches
        ``english_threshold``; otherwise ``'default'``. The Chinese check
        runs first, so it wins when both thresholds are met.
    """
    if not text or not text.strip():
        return 'default'

    chinese_count = 0
    english_count = 0
    total_chars = 0

    # Single pass over the text instead of two regex scans plus a list build.
    for ch in text:
        if not ch.strip():
            # Whitespace is excluded from the denominator.
            continue
        total_chars += 1
        if '\u4e00' <= ch <= '\u9fff':
            chinese_count += 1
        elif ch.isascii() and ch.isalpha():
            # ASCII alphabetic characters are exactly A-Z and a-z.
            english_count += 1

    # total_chars is >= 1 here: the guard above ensured that text contains
    # at least one non-whitespace character, so the division is safe.
    if chinese_count / total_chars >= chinese_threshold:
        return 'chinese'
    if english_count / total_chars >= english_threshold:
        return 'english'
    return 'default'