omniparse.py
# Example script: exercises the low-level OmniParseClient and the higher-level
# OmniParse parser against the sample files under EXAMPLE_DATA_PATH/omniparse.
import asyncio

from metagpt.config2 import config
from metagpt.const import EXAMPLE_DATA_PATH
from metagpt.logs import logger
from metagpt.rag.parsers import OmniParse
from metagpt.rag.schema import OmniParseOptions, OmniParseType, ParseResultType
from metagpt.utils.omniparse_client import OmniParseClient

# Sample inputs, one per media kind used by the examples below.
TEST_DOCX = EXAMPLE_DATA_PATH / "omniparse/test01.docx"
TEST_PDF = EXAMPLE_DATA_PATH / "omniparse/test02.pdf"
TEST_VIDEO = EXAMPLE_DATA_PATH / "omniparse/test03.mp4"
TEST_AUDIO = EXAMPLE_DATA_PATH / "omniparse/test04.mp3"


async def omniparse_client_example():
    """Call each ``OmniParseClient`` parse method once and log what it returns.

    The docx sample is passed as raw bytes together with a ``bytes_filename``;
    the pdf, video and audio samples are passed as paths.
    """
    omni_client = OmniParseClient(base_url=config.omniparse.base_url)

    # Document: read the file ourselves and hand the client in-memory bytes.
    with open(TEST_DOCX, "rb") as docx_file:
        docx_bytes = docx_file.read()
    logger.info(
        await omni_client.parse_document(
            file_input=docx_bytes,
            bytes_filename="test_01.docx",
        )
    )

    # PDF, video and audio: give the client the sample path directly.
    logger.info(await omni_client.parse_pdf(file_input=TEST_PDF))
    logger.info(await omni_client.parse_video(file_input=TEST_VIDEO))
    logger.info(await omni_client.parse_audio(file_input=TEST_AUDIO))


async def omniparse_example():
    """Load data through the ``OmniParse`` parser and log the results.

    First loads the single pdf sample via ``load_data`` (called without
    ``await``), then switches the parser's ``parse_type`` to ``DOCUMENT`` and
    loads the docx and pdf samples together via the awaited ``aload_data``.
    """
    options = OmniParseOptions(
        parse_type=OmniParseType.PDF,
        result_type=ParseResultType.MD,
        max_timeout=120,
        num_workers=3,
    )
    omni_parser = OmniParse(
        api_key=config.omniparse.api_key,
        base_url=config.omniparse.base_url,
        parse_options=options,
    )

    # Single file through the non-awaited entry point.
    single_result = omni_parser.load_data(file_path=TEST_PDF)
    logger.info(single_result)

    # Several files at once; the parse type is changed before the batch call.
    batch_inputs = [TEST_DOCX, TEST_PDF]
    omni_parser.parse_type = OmniParseType.DOCUMENT
    batch_result = await omni_parser.aload_data(file_path=batch_inputs)
    logger.info(batch_result)


async def main():
    """Run the client example, then the parser example, one after the other."""
    for example in (omniparse_client_example, omniparse_example):
        await example()


if __name__ == "__main__":
    asyncio.run(main())