exporter_html.py
import copy
import html
import json
import math
import os
import shutil
import time

from wxManager.decrypt.decrypt_dat import batch_decode_image_multiprocessing
from wxManager.log import logger
from wxManager.model import MessageType, Me
from exporter.exporter import ExporterBase, copy_files, decode_audios, get_new_filename

# Icon key -> file extensions grouped under that key.
# NOTE(review): not referenced by the code in this module; presumably consumed
# elsewhere -- verify before changing or removing it.
icon_files = {
    'DOCX': ['doc', 'docx'],
    'XLS': ['xls', 'xlsx'],
    'CSV': ['csv'],
    'TXT': ['txt'],
    'ZIP': ['zip', '7z', 'rar'],
    'PPT': ['ppt', 'pptx'],
    'PDF': ['pdf'],
}


def _escape_strings(dic: dict) -> dict:
    """HTML-escape the string values of *dic* in place and return it.

    Strings starting with ``http`` are left untouched; nested dicts are
    processed recursively.  Values of any other type (including lists) are
    not visited.
    """
    for key, value in dic.items():
        if isinstance(value, str):
            if not value.startswith('http'):
                dic[key] = html.escape(value)
        elif isinstance(value, dict):
            _escape_strings(value)
    return dic


class HtmlExporter(ExporterBase):
    """Exporter that writes a contact's messages into an HTML page built from
    ``resources/template.html``, together with the media files it references."""

    def export(self) -> None:
        """Export the selected messages of ``self.contact`` as HTML.

        The method
        1. copies the bundled ``emoji`` resources into ``self.origin_path`` and
           splits the HTML template at its separator comment,
        2. walks the messages returned by the database, queues
           image/video/file/audio work, rewrites message paths to
           export-relative ones and builds the lookup tables the template
           expects,
        3. runs the queued decode/copy work,
        4. writes ``<remark>.html`` and a ``<remark>.html.json`` dump of the
           unescaped message data, then reports completion through the
           progress and finish callbacks.
        """
        remark = self.contact.remark
        print(f"【开始导出 HTML {remark}】")
        filename = get_new_filename(os.path.join(self.origin_path, f'{remark}.html'))

        # Resources live next to this script.
        resources_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources')
        shutil.copytree(
            os.path.join(resources_dir, 'emoji'),
            os.path.join(self.origin_path, 'emoji'),
            dirs_exist_ok=True,
        )
        with open(os.path.join(resources_dir, 'template.html'), "r", encoding="utf-8") as template:
            content = template.read()
        html_head, html_end = content.split('/*注意看这是分割线*/')

        # The remark is user-controlled text placed into HTML markup, so it
        # must be escaped to keep the page well-formed.
        safe_title = html.escape(str(remark))
        html_head = html_head.replace("<title>出错了</title>", f"<title>{safe_title}</title>")
        html_head = html_head.replace("<p id=\"title\">出错了</p>", f"<p id=\"title\">{safe_title}</p>")
        # Avatar export is currently disabled: the template gets empty lists.
        html_head = html_head.replace("{{avatarPaths}}", json.dumps([]))
        html_head = html_head.replace("{{avatarUrls}}", json.dumps([]))
        html_head = html_head.replace('{{wxid}}', f'"{self.contact.wxid}"')

        messages = self.database.get_messages(self.contact.wxid, time_range=self.time_range)

        # Lookup tables substituted into the template's placeholders.
        timeline_data = {}      # year -> month -> [page, server_id] of the first message seen
        page_timeline = {}      # page -> {'year', 'month'} of the first message on that page
        server_id_page = {}     # server_id -> page number (1-based)
        server_id_idx = {}      # server_id -> position among exported messages (0-based)
        date_data_map = {}      # 'YYYY-MM-DD' -> [page, server_id] of the first message seen

        # Positions (into the exported message list) grouped by category.
        image_index = []
        file_index = []
        link_index = []
        music_index = []
        transfer_index = []
        mini_program_index = []
        video_number_index = []

        items_per_page = 100
        html_json = []

        # Deferred work, executed in bulk after the scan.
        image_tasks = []
        video_tasks = []
        file_tasks = []
        audio_tasks = []
        image_dir = os.path.join(self.origin_path, 'image')
        video_dir = os.path.join(self.origin_path, 'video')
        audio_dir = os.path.join(self.origin_path, 'voice')
        file_dir = os.path.join(self.origin_path, 'file')
        total_steps = len(messages)

        link_types = (
            MessageType.LinkMessage,
            MessageType.LinkMessage2,
            MessageType.LinkMessage4,
            MessageType.LinkMessage5,
            MessageType.LinkMessage6,
        )

        def queue_media(msg, require_existing_file):
            """Queue export work for an image/file/video message and rewrite
            its path(s) to export-relative ones; other types are ignored.

            When *require_existing_file* is true, a file message keeps its
            original path unless the source file exists on disk.
            """
            type_ = msg.type
            if type_ == MessageType.Image:
                msg.set_file_name()
                month = msg.str_time[:7]
                target_dir = os.path.join(image_dir, month)
                thumb_name = msg.file_name + '_t'
                image_tasks.append((os.path.join(Me().wx_dir, msg.path), target_dir, msg.file_name))
                image_tasks.append((os.path.join(Me().wx_dir, msg.thumb_path), target_dir, thumb_name))
                msg.path = f"./image/{month}/{msg.file_name}"
                msg.thumb_path = f"./image/{month}/{thumb_name}"
            elif type_ == MessageType.File:
                month = msg.str_time[:7]
                origin_file_path = os.path.join(Me().wx_dir, msg.path)
                file_tasks.append((origin_file_path, os.path.join(file_dir, month), ''))
                if not require_existing_file or os.path.isfile(origin_file_path):
                    msg.path = f'./file/{month}/{os.path.basename(origin_file_path)}'
            elif type_ == MessageType.Video:
                msg.set_file_name()
                month = msg.str_time[:7]
                video_tasks.append(
                    (os.path.join(Me().wx_dir, msg.path), os.path.join(video_dir, month), msg.file_name)
                )
                ext = os.path.basename(msg.path).split('.')[-1]
                msg.path = f'./video/{month}/{msg.file_name}.{ext}'

        def parser_merged(merged_message):
            """Queue media for every message nested in a merged-messages record."""
            for msg in merged_message.messages:
                if msg.type == MessageType.MergedMessages:
                    parser_merged(msg)
                else:
                    queue_media(msg, require_existing_file=False)

        for index, message in enumerate(messages):
            if not self._is_running:
                break
            if index and index % 1000 == 0:
                self.update_progress_callback(index / total_steps)
            type_ = message.type
            if not self.is_selected(message):
                continue
            server_id = message.server_id
            position = len(html_json)  # 0-based position of this message in the export

            if type_ == MessageType.Image:
                image_index.append(position)
                queue_media(message, require_existing_file=True)
            elif type_ == MessageType.File:
                file_index.append(position)
                queue_media(message, require_existing_file=True)
            elif type_ == MessageType.Video:
                # Videos are recorded in the image index, as in the original
                # code. NOTE(review): presumably intentional (shared media
                # view) -- confirm against the template.
                image_index.append(position)
                queue_media(message, require_existing_file=True)
            elif type_ == MessageType.Audio:
                message.set_file_name()
                month = message.str_time[:7]
                audio_tasks.append(
                    (
                        self.database.get_media_buffer(message.server_id, self.contact.is_public()),
                        os.path.join(audio_dir, month),
                        message.file_name,
                    )
                )
                message.path = f'./voice/{month}/{message.file_name + ".mp3"}'
            elif type_ in link_types:
                link_index.append(position)
            elif type_ == MessageType.Music:
                music_index.append(position)
            elif type_ == MessageType.Transfer:
                transfer_index.append(position)
            elif type_ == MessageType.Applet or type_ == MessageType.Applet2:
                mini_program_index.append(position)
            elif type_ == MessageType.WeChatVideo:
                video_number_index.append(position)
            elif type_ == MessageType.MergedMessages:
                parser_merged(message)

            # Serialise only after the paths above have been rewritten.
            html_json.append(message.to_json())

            # str_time is sliced as 'YYYY-MM-DD...' (e.g. 2024-01-01).
            str_time = message.str_time
            year = str_time[:4]
            month_number = int(str_time[5:7])
            day = str_time[:10]
            sid = str(server_id)
            curpage = math.ceil((position + 1) / items_per_page)

            if day not in date_data_map:
                date_data_map[day] = [curpage, sid]
            months = timeline_data.setdefault(year, {})
            if month_number not in months:
                months[month_number] = [curpage, sid]
            if curpage not in page_timeline:
                page_timeline[curpage] = {'year': year, 'month': month_number}

            server_id_page[sid] = curpage
            server_id_idx[sid] = position

        logger.info('解析图片')
        # Decode all queued images (multiprocessing helper).
        batch_decode_image_multiprocessing(Me().xor_key, image_tasks)
        print('开始复制文件')
        copy_tasks = video_tasks + file_tasks
        logger.info(f'开始复制{len(copy_tasks)}')
        # Copy videos and files into the export folder.
        copy_files(copy_tasks)
        print('开始导出语音')
        logger.info('开始导出语音')
        decode_audios(audio_tasks)

        replace_map = {
            "{{timelineData}}": timeline_data,
            "{{PageTimeline}}": page_timeline,
            "{{server_id_Page}}": server_id_page,
            "{{server_id_Idx}}": server_id_idx,
            "{{dateDataMap}}": date_data_map,
            "{{AllIndex}}": list(range(len(html_json))),
            "{{ImageIndex}}": image_index,
            "{{FileIndex}}": file_index,
            "{{LinkIndex}}": link_index,
            "{{MusicIndex}}": music_index,
            "{{TransferIndex}}": transfer_index,
            "{{MiniProgramIndex}}": mini_program_index,
            "{{VideoNumberIndex}}": video_number_index,
        }

        print('开始字符串转义')
        logger.info('开始字符串转义')
        # Escape strings on a deep copy so the JSON dump below keeps raw text.
        html_data = [_escape_strings(item) for item in copy.deepcopy(html_json)]
        for key, value in replace_map.items():
            html_end = html_end.replace(key, json.dumps(value))

        # Write the page in one go; the context manager guarantees the handle
        # is closed even if serialisation fails part-way.
        with open(filename, 'w', encoding='utf-8') as out:
            out.write(html_head)
            json.dump(html_data, out, ensure_ascii=False, indent=4)
            out.write(html_end)

        with open(filename + '.json', 'w', encoding='utf-8') as dump:
            json.dump(html_json, dump, ensure_ascii=False, indent=4)

        self.update_progress_callback(1)
        print(f"【完成导出 HTML {self.contact.remark}】{len(messages)}")
        self.finish_callback(self.exporter_id)