office_load.py 907 B

1234567891011121314151617181920212223242526
  1. # import os
  2. # from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  3. # from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  4. # from magic_pdf.data.read_api import read_local_office
  5. # class MinerUParseOffice():
  6. # # def __init__(self, knowledge_id):
  7. # # self.knowledge_id = knowledge_id
  8. # async def extract_text(self, file_path):
  9. # local_image_dir = "./tmp_file/images"
  10. # image_dir = str(os.path.basename(local_image_dir))
  11. # os.makedirs(local_image_dir, exist_ok=True)
  12. # image_writer = FileBasedDataWriter(local_image_dir)
  13. # ds = read_local_office(file_path)[0]  # only the first item returned is used
  14. # infer_result = ds.apply(doc_analyze, ocr=True)
  15. # pipe_result = infer_result.pipe_ocr_mode(image_writer)
  16. # content_list_content = pipe_result.get_content_list(image_dir)
  17. # return content_list_content