CodeNameRelabel.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. '''
  2. Created on 2019年1月3日
  3. @author: User
  4. '''
  5. import psycopg2
  6. import re
  7. import pandas as pd
  8. from BiddingKG.dl.common.Utils import *
  9. #取编号名称数据到excel中
  10. def getCodeNameToExcel():
  11. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  12. cursor = conn.cursor()
  13. sql = " select A.id,A.content,B.projectcode,B.projectname from articles_processed A,project B where A.id=B.doc_id "
  14. cursor.execute(sql)
  15. rows = cursor.fetchall()
  16. list_doc_id = []
  17. list_codenameflag = []
  18. list_codename_before = []
  19. list_codename = []
  20. list_codename_after = []
  21. context_len = 10
  22. for row in rows:
  23. doc_id = row[0]
  24. contents = row[1]
  25. codes = row[2]
  26. names = row[3]
  27. for content in re.split("。",contents):
  28. for code in re.split("[;;]",codes):
  29. code = code.strip()
  30. if code=="":
  31. continue
  32. for index in findAllIndex(code, content):
  33. list_doc_id.append(doc_id)
  34. list_codenameflag.append("code")
  35. if index>context_len:
  36. list_codename_before.append(content[index-context_len:index])
  37. else:
  38. list_codename_before.append(content[0:index])
  39. list_codename.append(code)
  40. if index<len(content)-context_len:
  41. list_codename_after.append(content[index+len(code):index+len(code)+context_len])
  42. else:
  43. list_codename_after.append(content[index+len(code):len(content)])
  44. #print(list_codename)
  45. for code in re.split("[;;]",names):
  46. code = code.strip()
  47. if code=="":
  48. continue
  49. for index in findAllIndex(code, content):
  50. list_doc_id.append(doc_id)
  51. list_codenameflag.append("name")
  52. if index>context_len:
  53. list_codename_before.append(content[index-context_len:index])
  54. else:
  55. list_codename_before.append(content[0:index])
  56. list_codename.append(code)
  57. if index<len(content)-context_len:
  58. list_codename_after.append(content[index+len(code):index+len(code)+context_len])
  59. else:
  60. list_codename_after.append(content[index+len(code):len(content)])
  61. for i in range(len(list_codename_after)):
  62. if list_codenameflag[i]=="code" and len(list_codename_after[i])>0 and re.search("\d",list_codename_after[i][0]) is not None:
  63. #print(i+1,list_codename[i]+list_codename_after[i])
  64. pass
  65. list_search = []
  66. for i in range(len(list_codename_after)):
  67. if list_codenameflag[i]=="name" and len(list_codename_after[i])>3:
  68. s = re.search("^项目|^施工|^监理|^[0-9A-Za-z一二三四五六七八九十]{1,2}标段|^系统|^设计|^服务",list_codename_after[i][0:4])
  69. if s is not None:
  70. list_search.append(list_codename[i]+s.group())
  71. else:
  72. list_search.append("")
  73. else:
  74. list_search.append("")
  75. dataframe = pd.DataFrame({'id':list_doc_id,'flag':list_codenameflag,"before":list_codename_before,"value":list_codename,"after":list_codename_after,"search":list_search})
  76. columns = ['id','flag',"before","value","after","search"]
  77. dataframe.to_excel("handlabel/codenames.xls",index=False,header=False,encoding="utf8",columns=columns)
  78. #保存结果到数据库
  79. def persistenceToDB():
  80. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  81. cursor = conn.cursor()
  82. file = "handlabel/codenames_tailCompleted.xls"
  83. pds = pd.read_excel(file)
  84. list_doc_id = pds['doc_id']
  85. list_flag = pds['flag']
  86. list_before = pds['before']
  87. list_codename = pds['codename']
  88. list_after = pds['after']
  89. list_codechange = pds['codechange']
  90. list_namechange = pds['namechange']
  91. dic = dict()
  92. for i in range(len(list_doc_id)):
  93. if list_doc_id[i] not in dic.keys():
  94. dic[list_doc_id[i]] = [set(),set()]
  95. if list_flag[i]=="code":
  96. if str(list_codechange[i])!="nan":
  97. dic[list_doc_id[i]][0].add(str(list_codechange[i]))
  98. else:
  99. if str(list_codename[i])!="nan":
  100. dic[list_doc_id[i]][0].add(str(list_codename[i]))
  101. else:
  102. if str(list_namechange[i])!="nan":
  103. dic[list_doc_id[i]][1].add(str(list_namechange[i]))
  104. else:
  105. if str(list_codename[i])!="nan":
  106. dic[list_doc_id[i]][1].add(str(list_codename[i]))
  107. for item in dic.keys():
  108. sql = " insert into project_relabel values('"+str(item)+"','"+";".join(list(dic[item][0]))+"','"+";".join(list(dic[item][1]))+"')"
  109. cursor.execute(sql)
  110. '''
  111. #确认无误之后替换
  112. update project set projectcode=(select projectcode from project_relabel A where A.doc_id=project.doc_id) where
  113. exists(select 1 from project_relabel B where B.doc_id=project.doc_id and B.projectcode!='')
  114. update project set projectname=(select projectname from project_relabel A where A.doc_id=project.doc_id) where
  115. exists(select 1 from project_relabel B where B.doc_id=project.doc_id and B.projectname!='')
  116. '''
  117. conn.commit()
  118. conn.close()
  119. if __name__=="__main__":
  120. #getCodeNameToExcel()
  121. persistenceToDB()