util.py
import re
import unicodedata

# Sequences that strip_modifiers replaces with a single space before any other
# processing. NOTE(review): the name suggests these glyphs render incorrectly
# somewhere downstream -- confirm with the callers.
invalid_rendering = ["🕵️", "☝"]

# Code points removed explicitly after the category filter has run: variation
# selectors (U+FE00-FE0F and U+E0100-E01EF), emoji skin-tone modifiers
# (U+1F3FB-1F3FF) and the zero-width non-joiner / joiner.  The skin-tone
# modifiers are category "Sk", which the category filter keeps, so this pass
# is what actually removes them; the other ranges are a defensive second net.
_RESIDUAL_MODIFIERS = re.compile(
    r'[\uFE00-\uFE0F\U000E0100-\U000E01EF\U0001F3FB-\U0001F3FF\u200D\u200C]'
)

# A carriage return, optionally followed by a line feed.
_CR_LINE_ENDING = re.compile(r'\r\n?')


def _is_dropped(char: str) -> bool:
    """Return True when *char* is a mark (M*) or a format character (Cf)."""
    category = unicodedata.category(char)
    return category.startswith('M') or category == 'Cf'


def strip_modifiers(text: str) -> str:
    """Return *text* with combining marks and invisible modifiers removed.

    The steps run in this order:

    1. Every sequence listed in ``invalid_rendering`` is replaced by a space.
    2. Characters whose Unicode general category is a mark (``Mn``, ``Mc``,
       ``Me``) or a format character (``Cf``) are dropped.  Everything else,
       including all symbol categories, is kept.
    3. Variation selectors, emoji skin-tone modifiers and the zero-width
       joiner / non-joiner are removed.
    4. ``\\r\\n`` and lone ``\\r`` are normalised to ``\\n``.

    Spacing modifier symbols (category ``Sk``, e.g. ``^`` and `````) are
    deliberately left alone apart from the skin-tone range handled in step 3.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # This must happen first: an entry may contain a variation selector that
    # the later steps would otherwise strip, preventing the match.
    for sequence in invalid_rendering:
        text = text.replace(sequence, " ")

    kept = ''.join(char for char in text if not _is_dropped(char))
    kept = _RESIDUAL_MODIFIERS.sub('', kept)

    # Normalise line endings last so that a "\r" and "\n" separated only by
    # removed characters still collapse into a single "\n".
    return _CR_LINE_ENDING.sub('\n', kept)