# -*- coding: utf-8 -*-
"""Convert unicode escape sequences to Chinese characters in source files."""
import re
import os
import codecs

BASE_DIR = r'f:\Workspace2016\BidiRAG'

# Matches either an escaped UTF-16 surrogate pair (high surrogate D800-DBFF
# immediately followed by low surrogate DC00-DFFF) or a single \uXXXX escape.
# The pair alternative is listed first so that it wins when both could match.
_UNICODE_ESCAPE_RE = re.compile(
    r'\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})'
    r'|\\u([0-9a-fA-F]{4})'
)


def _replace_unicode(match):
    """Return the character for one matched escape (or escaped surrogate pair).

    An unpaired surrogate escape is returned unchanged: a lone surrogate
    cannot be encoded as UTF-8, so converting it would make the later file
    write fail.
    """
    high, low, single = match.groups()
    if high is not None:
        # Combine the two UTF-16 halves into one supplementary code point.
        code_point = (
            0x10000
            + ((int(high, 16) - 0xD800) << 10)
            + (int(low, 16) - 0xDC00)
        )
        return chr(code_point)
    code_point = int(single, 16)
    if 0xD800 <= code_point <= 0xDFFF:
        return match.group(0)
    return chr(code_point)


def decode_unicode_in_file(filepath):
    """Convert all \\uXXXX sequences in a file to actual Unicode characters.

    The file is read and written as UTF-8. Escaped surrogate pairs are merged
    into a single character; unpaired surrogate escapes are left as-is.

    Args:
        filepath: Path of the file to rewrite in place.

    Returns:
        True if the file content changed and was rewritten, False otherwise
        (in which case the file is not touched).
    """
    # newline='' disables newline translation so the file's existing line
    # endings are written back exactly as they were read.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    new_content = _UNICODE_ESCAPE_RE.sub(_replace_unicode, content)
    if new_content == content:
        return False

    with open(filepath, 'w', encoding='utf-8', newline='') as f:
        f.write(new_content)
    return True


files_to_convert = [
    os.path.join(BASE_DIR, 'bdirag', 'rag_methods.py'),
    os.path.join(BASE_DIR, 'bdirag', 'config.py'),
    os.path.join(BASE_DIR, 'examples', 'test_bm25.py'),
]


def main():
    """Convert every file in ``files_to_convert`` and report the outcome."""
    for filepath in files_to_convert:
        if not os.path.exists(filepath):
            print("File not found: {}".format(filepath))
            continue
        if decode_unicode_in_file(filepath):
            print("Converted: {}".format(filepath))
        else:
            print("No changes needed: {}".format(filepath))
    print("Done!")


if __name__ == "__main__":
    main()