guess_suffix_or_lang.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435
  1. from pathlib import Path
  2. from loguru import logger
  3. from magika import Magika
# Label returned by guess_language_by_text when Magika reports "unknown".
DEFAULT_LANG = "txt"
# Leading bytes compared against the first four bytes of a file when
# checking whether a ".pdf"-named file is really a PDF.
PDF_SIG_BYTES = b'%PDF'
# Single module-level Magika instance shared by every helper in this file.
magika = Magika()
  7. def guess_language_by_text(code):
  8. codebytes = code.encode(encoding="utf-8")
  9. lang = magika.identify_bytes(codebytes).prediction.output.label
  10. return lang if lang != "unknown" else DEFAULT_LANG
  11. def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
  12. suffix = magika.identify_bytes(file_bytes).prediction.output.label
  13. if file_path and suffix in ["ai", "html"] and Path(file_path).suffix.lower() in [".pdf"] and file_bytes[:4] == PDF_SIG_BYTES:
  14. suffix = "pdf"
  15. return suffix
  16. def guess_suffix_by_path(file_path) -> str:
  17. if not isinstance(file_path, Path):
  18. file_path = Path(file_path)
  19. suffix = magika.identify_path(file_path).prediction.output.label
  20. if suffix in ["ai", "html"] and file_path.suffix.lower() in [".pdf"]:
  21. try:
  22. with open(file_path, 'rb') as f:
  23. if f.read(4) == PDF_SIG_BYTES:
  24. suffix = "pdf"
  25. except Exception as e:
  26. logger.warning(f"Failed to read file {file_path} for PDF signature check: {e}")
  27. return suffix