compare.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. '''
  2. Created on 2019年6月13日
  3. @author: User
  4. '''
  5. import fool
  6. from bi_lstm_crf import *
  7. import pandas as pd
  8. import codecs
  9. import re
  10. import requests
  11. from BiddingKG.dl.BertNer.BertCRF import *
  12. ''''''
  13. def compare(text):
  14. print(fool.ner(text))
  15. '''
  16. bilstm.initVariables()
  17. '''
  18. # init_op = tf.global_variables_initializer()
  19. # sess.run(init_op)
  20. # summaryWriter = tf.summary.FileWriter('log/', tf.get_default_graph())
  21. print(bilstm.ner(text))
  22. _ner_fool = fool.ner(text)
  23. _ner_selffool = bilstm.ner(text)
  24. if len(set(_ner_fool[0]) & set(_ner_selffool[0])) == len(_ner_fool[0]):
  25. print(set(fool.ner(text)[0]) & set(bilstm.ner(text)[0]))
  26. def dealNotFoundEntity():
  27. '''
  28. @summary: 处理未识别数据
  29. '''
  30. df = pd.read_excel("C:\\Users\\User\\Desktop\\无法分离实体名称.xlsx")
  31. list_newname_fool = []
  32. list_newname_selffool = []
  33. count = 0
  34. for _name in df["name"]:
  35. count += 1
  36. print(_name)
  37. if str(_name) == "nan":
  38. list_newname_fool.append("")
  39. list_newname_selffool.append("")
  40. continue
  41. print(count, len(df["name"]))
  42. _newname_fool = ""
  43. _newname_selffool = ""
  44. for _ner in fool.ner(_name)[0]:
  45. _newname_fool += _ner[3] + "##"
  46. for _ner in bilstm.ner(_name)[0]:
  47. _newname_selffool += _ner[3] + "##"
  48. list_newname_fool.append(_newname_fool[:-2])
  49. list_newname_selffool.append(_newname_selffool[:-2])
  50. data = {"id": df["id"],
  51. "area": df["area"],
  52. "province": df["province"],
  53. "city": df["city"],
  54. "district": df["district"],
  55. "name": df["name"],
  56. "newname_fool": list_newname_fool,
  57. "newname_selffool": list_newname_selffool}
  58. _df = pd.DataFrame(data, columns=["id", "area", "province", "city", "district", "name", "newname_fool",
  59. "newname_selffool"])
  60. _df.to_excel("C:\\Users\\User\\Desktop\\无法分离实体名称_deal.xls")
  61. def nerEntity():
  62. file = "C:\\Users\\User\\Desktop\\select_company_name_from_bxkc_C_CONTACT_.tsv"
  63. file_found = "C:\\Users\\User\\Desktop\\company_found.tsv"
  64. file_notfound = "C:\\Users\\User\\Desktop\\company_notfound.tsv"
  65. with codecs.open(file, "r", encoding="utf8") as f:
  66. with codecs.open(file_found, "w", encoding="utf8") as f_found:
  67. with codecs.open(file_notfound, "w", encoding="utf8") as f_notfound:
  68. while (True):
  69. line = f.readline().strip()
  70. if not line:
  71. break
  72. entity = re.sub(")", ")", re.sub("(", "(", line))
  73. if re.search("公司$", entity):
  74. _ner = bilstm.ner(entity)[0]
  75. if len(_ner) == 1 and _ner[0][3] == entity:
  76. f_found.write(entity + "\n")
  77. else:
  78. f_notfound.write(entity + "\n")
  79. def cleanEntity():
  80. source_file = "C:\\Users\\User\\Desktop\\notcleanedEntity.tsv"
  81. temp_file = "C:\\Users\\User\\Desktop\\temp.tsv"
  82. set_cleanedEntity = set()
  83. set_notcleanedEntity = set()
  84. with codecs.open(source_file, "r", encoding="utf8") as f_nce:
  85. while (True):
  86. line = f_nce.readline().strip()
  87. if not line:
  88. break
  89. entity = re.sub('["\s]', "", line)
  90. f_1 = list(re.finditer("公司", entity))
  91. f_2 = list(re.finditer("[支分]公司", entity))
  92. # if len(f_1)==2 and len(f_2)==1 and re.search("[原;;.。、\|,,]",entity[f_1[0].span()[1]:f_1[1].span()[0]]) is None:
  93. if re.search("br|/", entity) is not None:
  94. # f_ce.write(entity+"\n")
  95. set_cleanedEntity.add(entity)
  96. else:
  97. set_notcleanedEntity.add(entity)
  98. list_cleanedEntity = list(set_cleanedEntity)
  99. list_cleanedEntity.sort(key=lambda x: len(x))
  100. list_notcleanedEntity = list(set_notcleanedEntity)
  101. list_notcleanedEntity.sort(key=lambda x: len(x))
  102. with codecs.open(temp_file, "w", encoding="utf8") as f_ce:
  103. with codecs.open(source_file, "w", encoding="utf8") as f_nce:
  104. for item in list_cleanedEntity:
  105. f_ce.write(item + "\n");
  106. for item in list_notcleanedEntity:
  107. f_nce.write(item + "\n")
  108. from urllib import parse
  109. if __name__ == "__main__":
  110. '''
  111. path_add = "0-12/"
  112. path = 'model/'+path_add+'model.ckpt'
  113. bilstm = BiLSTM().restore(path)
  114. '''
  115. bertCrf = BertCRF().restore()
  116. text = '小册子一批采购计划一、采购人:广州市比地数据科技有限公司,二、采购项目编号:'
  117. print(bertCrf.ner(text))
  118. # dealNotFoundEntity()
  119. pass
  120. '''
  121. cleanEntity()
  122. '''