123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136 |
- '''
- Created on 2019年1月3日
- @author: User
- '''
- import psycopg2
- import re
- import pandas as pd
- from BiddingKG.dl.common.Utils import *
- #取编号名称数据到excel中
- def getCodeNameToExcel():
- conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- sql = " select A.id,A.content,B.projectcode,B.projectname from articles_processed A,project B where A.id=B.doc_id "
-
- cursor.execute(sql)
-
- rows = cursor.fetchall()
-
- list_doc_id = []
- list_codenameflag = []
- list_codename_before = []
- list_codename = []
- list_codename_after = []
- context_len = 10
- for row in rows:
- doc_id = row[0]
- contents = row[1]
- codes = row[2]
- names = row[3]
- for content in re.split("。",contents):
- for code in re.split("[;;]",codes):
- code = code.strip()
- if code=="":
- continue
- for index in findAllIndex(code, content):
- list_doc_id.append(doc_id)
- list_codenameflag.append("code")
- if index>context_len:
- list_codename_before.append(content[index-context_len:index])
- else:
- list_codename_before.append(content[0:index])
- list_codename.append(code)
- if index<len(content)-context_len:
- list_codename_after.append(content[index+len(code):index+len(code)+context_len])
- else:
- list_codename_after.append(content[index+len(code):len(content)])
- #print(list_codename)
- for code in re.split("[;;]",names):
- code = code.strip()
- if code=="":
- continue
- for index in findAllIndex(code, content):
- list_doc_id.append(doc_id)
- list_codenameflag.append("name")
- if index>context_len:
- list_codename_before.append(content[index-context_len:index])
- else:
- list_codename_before.append(content[0:index])
- list_codename.append(code)
- if index<len(content)-context_len:
- list_codename_after.append(content[index+len(code):index+len(code)+context_len])
- else:
- list_codename_after.append(content[index+len(code):len(content)])
- for i in range(len(list_codename_after)):
- if list_codenameflag[i]=="code" and len(list_codename_after[i])>0 and re.search("\d",list_codename_after[i][0]) is not None:
- #print(i+1,list_codename[i]+list_codename_after[i])
- pass
-
- list_search = []
- for i in range(len(list_codename_after)):
- if list_codenameflag[i]=="name" and len(list_codename_after[i])>3:
- s = re.search("^项目|^施工|^监理|^[0-9A-Za-z一二三四五六七八九十]{1,2}标段|^系统|^设计|^服务",list_codename_after[i][0:4])
- if s is not None:
- list_search.append(list_codename[i]+s.group())
- else:
- list_search.append("")
- else:
- list_search.append("")
-
- dataframe = pd.DataFrame({'id':list_doc_id,'flag':list_codenameflag,"before":list_codename_before,"value":list_codename,"after":list_codename_after,"search":list_search})
- columns = ['id','flag',"before","value","after","search"]
- dataframe.to_excel("handlabel/codenames.xls",index=False,header=False,encoding="utf8",columns=columns)
-
- #保存结果到数据库
- def persistenceToDB():
-
- conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- file = "handlabel/codenames_tailCompleted.xls"
- pds = pd.read_excel(file)
- list_doc_id = pds['doc_id']
- list_flag = pds['flag']
- list_before = pds['before']
- list_codename = pds['codename']
- list_after = pds['after']
- list_codechange = pds['codechange']
- list_namechange = pds['namechange']
-
- dic = dict()
- for i in range(len(list_doc_id)):
- if list_doc_id[i] not in dic.keys():
- dic[list_doc_id[i]] = [set(),set()]
- if list_flag[i]=="code":
- if str(list_codechange[i])!="nan":
- dic[list_doc_id[i]][0].add(str(list_codechange[i]))
- else:
- if str(list_codename[i])!="nan":
- dic[list_doc_id[i]][0].add(str(list_codename[i]))
- else:
- if str(list_namechange[i])!="nan":
- dic[list_doc_id[i]][1].add(str(list_namechange[i]))
- else:
- if str(list_codename[i])!="nan":
- dic[list_doc_id[i]][1].add(str(list_codename[i]))
- for item in dic.keys():
- sql = " insert into project_relabel values('"+str(item)+"','"+";".join(list(dic[item][0]))+"','"+";".join(list(dic[item][1]))+"')"
- cursor.execute(sql)
- '''
- #确认无误之后替换
- update project set projectcode=(select projectcode from project_relabel A where A.doc_id=project.doc_id) where
- exists(select 1 from project_relabel B where B.doc_id=project.doc_id and B.projectcode!='')
-
- update project set projectname=(select projectname from project_relabel A where A.doc_id=project.doc_id) where
- exists(select 1 from project_relabel B where B.doc_id=project.doc_id and B.projectname!='')
- '''
- conn.commit()
- conn.close()
- if __name__=="__main__":
- #getCodeNameToExcel()
- persistenceToDB()
|