# convert_unicode_to_chinese.py (1.3 KB)
  1. # -*- coding: utf-8 -*-
  2. """Convert unicode escape sequences to Chinese characters in source files."""
  3. import re
  4. import os
  5. import codecs
  6. BASE_DIR = r'f:\Workspace2016\BidiRAG'
  7. def decode_unicode_in_file(filepath):
  8. """Convert all \\uXXXX sequences in a file to actual Unicode characters."""
  9. with open(filepath, 'r', encoding='utf-8') as f:
  10. content = f.read()
  11. if not re.search(r'\\u[0-9a-fA-F]{4}', content):
  12. return False
  13. def replace_unicode(match):
  14. try:
  15. return chr(int(match.group(0)[2:], 16))
  16. except:
  17. return match.group(0)
  18. new_content = re.sub(r'\\u[0-9a-fA-F]{4}', replace_unicode, content)
  19. with open(filepath, 'w', encoding='utf-8') as f:
  20. f.write(new_content)
  21. return True
  22. files_to_convert = [
  23. os.path.join(BASE_DIR, 'bdirag', 'rag_methods.py'),
  24. os.path.join(BASE_DIR, 'bdirag', 'config.py'),
  25. os.path.join(BASE_DIR, 'examples', 'test_bm25.py'),
  26. ]
  27. for filepath in files_to_convert:
  28. if os.path.exists(filepath):
  29. changed = decode_unicode_in_file(filepath)
  30. if changed:
  31. print("Converted: {}".format(filepath))
  32. else:
  33. print("No changes needed: {}".format(filepath))
  34. else:
  35. print("File not found: {}".format(filepath))
  36. print("Done!")