char_utils.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import re
  def is_hyphen_at_line_end(line):
      """Report whether a line finishes with a hyphenated word fragment.

      A line qualifies when its last non-whitespace characters are one or
      more ASCII letters immediately followed by a hyphen, e.g. ``"exam-"``
      or ``"exam-  \\n"``.

      Args:
          line (str): The line of text to inspect.

      Returns:
          bool: True if the line ends with letters plus a hyphen (optionally
          followed by whitespace), False otherwise.
      """
      # Letters, then a hyphen, then nothing but optional trailing whitespace.
      trailing_hyphen = re.search(r'[A-Za-z]+-\s*$', line)
      return trailing_hyphen is not None
  def full_to_half_exclude_marks(text: str) -> str:
      """Convert full-width letters and digits to half-width, leaving marks alone.

      Only the alphanumeric full-width ranges are mapped; full-width
      punctuation and every other character pass through unchanged.

      Args:
          text: String that may contain full-width letters or digits.

      Returns:
          A new string in which U+FF10-U+FF19, U+FF21-U+FF3A and
          U+FF41-U+FF5A are replaced by ASCII 0-9, A-Z and a-z respectively.
      """
      # Distance between a full-width code point and its ASCII counterpart.
      offset = 0xFEE0
      # Inclusive code point ranges: digits, upper-case letters, lower-case letters.
      alnum_ranges = ((0xFF10, 0xFF19), (0xFF21, 0xFF3A), (0xFF41, 0xFF5A))

      def to_half(ch):
          cp = ord(ch)
          if any(low <= cp <= high for low, high in alnum_ranges):
              return chr(cp - offset)
          return ch

      return ''.join(map(to_half, text))
  def full_to_half(text: str) -> str:
      """Convert full-width letters, digits and punctuation to half-width.

      Every code point in U+FF01-U+FF5E is shifted down by 0xFEE0 onto its
      ASCII counterpart (U+0021-U+007E). All other characters, including
      the ideographic space U+3000, are returned unchanged.

      Args:
          text: String that may contain full-width characters.

      Returns:
          A new string with the full-width characters replaced by their
          half-width equivalents.
      """
      return ''.join(
          # Shift into the ASCII range only for the full-width block.
          chr(ord(ch) - 0xFEE0) if 0xFF01 <= ord(ch) <= 0xFF5E else ch
          for ch in text
      )