office_load.py 873 B

1234567891011121314151617181920212223242526
  1. import os
  2. from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  3. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  4. from magic_pdf.data.read_api import read_local_office
  5. class MinerUParseOffice():
  6. # def __init__(self, knowledge_id):
  7. # self.knowledge_id = knowledge_id
  8. async def extract_text(self, file_path):
  9. local_image_dir = "./tmp_file/images"
  10. image_dir = str(os.path.basename(local_image_dir))
  11. os.makedirs(local_image_dir, exist_ok=True)
  12. image_writer = FileBasedDataWriter(local_image_dir)
  13. ds = read_local_office(file_path)[0] #
  14. infer_result = ds.apply(doc_analyze, ocr=True)
  15. pipe_result = infer_result.pipe_ocr_mode(image_writer)
  16. content_list_content = pipe_result.get_content_list(image_dir)
  17. return content_list_content