utils.py
# Copyright (c) 2024-2026 Tencent Zhuque Lab. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Requirement: Any integration or derivative work must explicitly attribute
# Tencent Zhuque Lab (https://github.com/Tencent/AI-Infra-Guard) in its
# documentation or user interface, as detailed in the NOTICE file.

import re
from typing import Literal

# Compiled once at import time so repeated calls do not go through
# ``re.compile`` again.
# Only the basic CJK Unified Ideographs block (U+4E00-U+9FFF) counts as Chinese.
_CHINESE_CHAR_PATTERN = re.compile(r'[\u4e00-\u9fff]')
# Only unaccented ASCII letters count as English.
_ENGLISH_CHAR_PATTERN = re.compile(r'[A-Za-z]')


def judge_language(
    text: str,
    chinese_threshold: float = 0.5,
    english_threshold: float = 0.5,
) -> Literal["default", "chinese", "english"]:
    """Classify *text* as Chinese, English, or neither by character ratio.

    The ratio of each script is its character count divided by the number
    of non-whitespace characters in *text*. Digits, punctuation and any
    other symbols therefore count towards the denominator but towards
    neither script.

    Args:
        text: The text to classify.
        chinese_threshold: Minimum ratio (inclusive) of characters in
            U+4E00-U+9FFF for the text to be classified as Chinese.
        english_threshold: Minimum ratio (inclusive) of ASCII letters for
            the text to be classified as English.

    Returns:
        ``'chinese'`` if the Chinese ratio reaches ``chinese_threshold``;
        otherwise ``'english'`` if the English ratio reaches
        ``english_threshold``; otherwise ``'default'``. Empty or
        whitespace-only input also yields ``'default'``. The Chinese check
        runs first, so it wins when both thresholds are met.
    """
    if not text or not text.strip():
        return 'default'

    # The guard above guarantees at least one non-whitespace character, so
    # the denominator is never zero.
    total_chars = sum(1 for char in text if not char.isspace())

    chinese_ratio = len(_CHINESE_CHAR_PATTERN.findall(text)) / total_chars
    if chinese_ratio >= chinese_threshold:
        return 'chinese'

    english_ratio = len(_ENGLISH_CHAR_PATTERN.findall(text)) / total_chars
    if english_ratio >= english_threshold:
        return 'english'

    return 'default'