1 năm trước cách đây · 8ca12cdad3
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -572,6 +572,26 @@ def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag
 
				     #print(result)
			
 
				     return result
			
 
				 
			
 
				+def get_context(sentence_text, begin_index, end_index, size=20, center_include=False):
			
 
				+    '''
			
 
				+    返回实体上下文信息
			
 
				+    :param sentence_text: 句子文本
			
 
				+    :param begin_index: 实体字开始位置
			
 
				+    :param end_index: 实体字结束位置
			
 
				+    :param size: 字偏移量
			
 
				+    :param center_include:
			
 
				+    :return:
			
 
				+    '''
			
 
				+    result = []
			
 
				+    begin = begin_index - size if begin_index>size else 0
			
 
				+    end = end_index + size
			
 
				+    result.append(sentence_text[begin: begin_index])
			
 
				+    if center_include:
			
 
				+        result.append(sentence_text[begin_index: end_index])
			
 
				+    result.append(sentence_text[end_index: end])
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				 #根据规则补全编号或名称两边的符号
			
 
				 def fitDataByRule(data):
			
 
				     symbol_dict = {"(":")",
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -2210,11 +2210,12 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式：名称)', '供应商名称', article_processed)  # 18889217, 84422177
			
 
				         article_processed = re.sub('，最高有效报价者：', '，中标人名称：', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
			
 
				         article_processed = re.sub('，最高有效报价：', '，投标报价：', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
			
 
				-        ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?：(?P<tenderee>[\w（）]{4,25}(/[\w（）]{4,25})?)/(?P<agency>[\w（）]{4,25})[，。]', article_processed)
			
 
				+        article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
			
 
				+        ser = re.search('(采购|招标|比选)人(名称)?/(采购|招标|比选)?代理机构(名称)?：(?P<tenderee>[\w（）]{4,25}(/[\w（）]{4,25})?)/(?P<agency>[\w（）]{4,25})[，。]', article_processed)
			
 
				         if ser:
			
 
				             article_processed = article_processed.replace(ser.group(0), '采购人名称：%s，采购代理机构名称：%s，' % (ser.group('tenderee'), ser.group('agency')))
			
 
				 
			
 
				-        ser2 = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?：(?P<tenderee>[\w（）]{4,25})[，。]', article_processed)
			
 
				+        ser2 = re.search('(采购|招标)人(名称)?/(采购|招标)?代理机构(名称)?：(?P<tenderee>[\w（）]{4,25})[，。/]', article_processed)
			
 
				         if ser2:
			
 
				             article_processed = article_processed.replace(ser2.group(0), '采购人名称：%s，采购代理机构名称：，' % (
			
 
				             ser2.group('tenderee')))
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -62,6 +62,7 @@ def extractCount(extract_dict):
 
				     bidding_budget = ""
			
 
				     win_tenderer = ""
			
 
				     win_bid_price = ""
			
 
				+    linklist_count = 0
			
 
				     for _key in dict_pack.keys():
			
 
				         if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
			
 
				             extract_count += 1
			
@@ -100,6 +101,13 @@ def extractCount(extract_dict):
 
				                                 win_bid_price = str(float(_role["role_money"]["money"]))
			
 
				                 if _role["role_name"]=="agency":
			
 
				                     agency = _role["role_text"]
			
 
				+                linklist = _role.get("linklist",[])
			
 
				+                for link in linklist:
			
 
				+                    for l in link:
			
 
				+                        if l!="":
			
 
				+                            linklist_count += 1
			
 
				+
			
 
				+    extract_count += linklist_count//2
			
 
				 
			
 
				     if project_code!="":
			
 
				         extract_count += 1
			
@@ -198,6 +206,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     log("get prem done of doc_id%s"%(doc_id))
			
 
				     cost_time["prem"] = round(time.time()-start_time,2)
			
 
				 
			
 
				+    # roles_l = get_role_context(doc_id, list_sentences, list_entitys)
			
 
				+    # return roles_l
			
 
				+
			
 
				     # start_time = time.time() # 产品名称及废标原因提取  此处作废 换到后面预测 2022/4/29
			
 
				     # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
			
 
				     # fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因，产品已加入到Entity类
			
@@ -329,7 +340,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2023-07-04'}
			
 
				+    version_date = {'version_date': '2023-09-13'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
			
 
				 
			
 
				     '''最终检查修正招标、中标金额'''
			
@@ -382,6 +393,20 @@ def get_ent_context(list_sentences, list_entitys):
 
				                 rs_list.append("%s %d %.4f; %s ## %s ## %s"%(_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
			
 
				     return '\n'.join(rs_list)
			
 
				 
			
 
				+def get_role_context(docid, list_sentences, list_entitys):
			
 
				+    rs_list = []
			
 
				+    sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
			
 
				+    for list_entity in list_entitys:
			
 
				+        for _entity in list_entity:
			
 
				+            if _entity.entity_type in ['org', 'company']:
			
 
				+                sentence = sentences[_entity.sentence_index]
			
 
				+                # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index, size=20,
			
 
				+                #                    center_include=False, word_flag=True, text=_entity.entity_text)
			
 
				+                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=20, center_include=False)
			
 
				+                rs_list.append((docid, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
			
 
				+                _entity.entity_text, _span[1]))
			
 
				+    return rs_list
			
 
				+
			
 
				 if __name__=="__main__":
			
 
				     import pandas as pd
			
 
				     t1 = time.time()
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -3460,15 +3460,17 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
 
				             content += attachment
			
 
				     else:
			
 
				         content = list_articles[0].content
			
 
				-    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同)）?(总?金额|[报总]?价)：', content) == None: # 只有一个中标角色且没有明确中标金额表达的
			
 
				+    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同|投标)）?(总?金额|[报总]?价)：', content) == None: # 只有一个中标角色且没有明确中标金额表达的
			
 
				         if total_product_money>0 and total_product_money<5000000000:
			
 
				             for value in prem[0]['prem'].values():
			
 
				+                ree_money = float(value['tendereeMoney'])
			
 
				                 for l in value['roleList']:
			
 
				                     try:
			
 
				                         # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
			
 
				                         #     l[2] = total_product_money
			
 
				                         #     log('修改中标金额为所有产品总金额')
			
 
				-                        if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money/10:
			
 
				+                        # if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) == 0 and float(l["role_money"]['money'])<total_product_money/10:
			
 
				+                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or float(l["role_money"]['money'])<ree_money/2): # 改为小于一半招标金额或为0时替换为合计金额
			
 
				                             l["role_money"]['money'] = total_product_money
			
 
				                             # print('修改中标金额为所有产品总金额')
			
 
				                     except Exception as e:
			
@@ -3522,7 +3524,7 @@ def limit_maximum_amount(dic, list_entity):
 
				         else:
			
 
				             maximum_amount = 50000000000
			
 
				             minximum_amount = 500
			
 
				-    elif re.search('(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text) or category in ['零售批发']:
			
 
				+    elif re.search('(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text):
			
 
				         # print('商品采购限额')
			
 
				         maximum_amount = 80000000
			
 
				         minximum_amount = 10
			
--- a/BiddingKG/dl/interface/header_set.pkl
+++ b/BiddingKG/dl/interface/header_set.pkl
--- a/BiddingKG/dl/interface/modelFactory.py
+++ b/BiddingKG/dl/interface/modelFactory.py
@@ -80,6 +80,22 @@ class Model_role_classify_word():
 
				         _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False) #  word_len=20
			
 
				         # print(_encode_span)
			
 
				         return _encode_span
			
 
				+
			
 
    def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):
        '''
        Encode the context of an entity that is addressed by character offsets.

        :param sentence_text: sentence text
        :param begin_index: character offset where the entity starts
        :param end_index: character offset where the entity ends
        :param size: number of context characters requested on each side
        :param kwargs: accepted but not used by this method
        :return: whatever encodeInput returns for the context strings
        '''
        # The entity text itself is left out (center_include=False).
        context = get_context(sentence_text, begin_index, end_index,
                              size=size, center_include=False)
        # word_len stays fixed at 20 regardless of the requested size.
        return encodeInput(context, word_len=20, word_flag=True, userFool=False)
			
 
				     
			
 
				     def predict(self,x):
			
 
				         x = np.transpose(np.array(x),(1,0,2))
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
--- a/BiddingKG/dl/metrics/extractMetric.py
+++ b/BiddingKG/dl/metrics/extractMetric.py
@@ -257,7 +257,8 @@ class ExtractMetric():
 
				         print(metrics)
			
 
				 
			
 
				     def extractFromInterface(self,content):
			
 
				-        return json.loads(test("",content))
			
 
				+        _json = test("",content)
			
 
				+        return json.loads(_json)
			
 
				 
			
 
				     def getDiff(self,_inter,_inter2):
			
 
				         _dict = {}
			
@@ -310,18 +311,18 @@ class ExtractMetric():
 
				             if float(v.get("tendereeMoney",0))>0:
			
 
				                 dict_project["%s_inter2"%("tendereeMoney")] = [float(v.get("tendereeMoney"))]
			
 
				             for _role in v.get("roleList",[]):
			
 
				-                dict_project["%s_inter2"%_role[0]] = [_role[1]]
			
 
				-                if _role[0] in ["win_tenderer","second_tenderer","third_tenderer"]:
			
 
				-                    if float(_role[2])>0:
			
 
				-                        dict_project["%s_money_inter2"%_role[0]] = [float(_role[2])]
			
 
				-                for item in _role[3]:
			
 
				+                dict_project["%s_inter2"%_role.get("role_type")] = [_role.get("role_text")]
			
 
				+                if _role.get("role_type") in ["win_tenderer","second_tenderer","third_tenderer"]:
			
 
				+                    if float(_role.get("role_money").get("money",0))>0:
			
 
				+                        dict_project["%s_money_inter2"%_role.get("role_type")] = [float(_role.get("role_money").get("money",0))]
			
 
				+                for item in _role.get("linklist"):
			
 
				                     _person = item[0]
			
 
				                     _phone = item[1]
			
 
				                     if _person=="" or _phone=="":
			
 
				                         continue
			
 
				-                    if "%s_person_inter2"%_role[0] not in dict_project:
			
 
				-                        dict_project["%s_person_inter2"%_role[0]] = []
			
 
				-                    dict_project["%s_person_inter2"%_role[0]].append("%s-%s"%(_role[1],_person))
			
 
				+                    if "%s_person_inter2"%_role.get("role_type") not in dict_project:
			
 
				+                        dict_project["%s_person_inter2"%_role.get("role_type")] = []
			
 
				+                    dict_project["%s_person_inter2"%_role.get("role_type")].append("%s-%s"%(_role.get("role_text"),_person))
			
 
				                     if "person_phone_inter2" not in dict_project:
			
 
				                         dict_project["person_phone_inter2"] = []
			
 
				                     dict_project["person_phone_inter2"].append("%s-%s"%(_person,_phone))
			
--- a/BiddingKG/dl/test/compare1.txt
+++ b/BiddingKG/dl/test/compare1.txt
@@ -0,0 +1,3 @@
 
				+
			
 
				+
			
 
				+import json
			
--- a/BiddingKG/dl/time/re_servicetime.py
+++ b/BiddingKG/dl/time/re_servicetime.py
@@ -20,9 +20,9 @@ TEST_MODE = False
 
				 
			
 
				 before = '(?P<before>' \
			
 
				          '合同期限|工期/交货期/服务期|工期，|工期\(交货期\)|合格工期|服务期限|工期' \
			
 
				-         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
			
 
				+         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \
			
 
				          '|合格工期|计划工期\(服务期\)|服务期|服务，期|交货\(完工\)时间|交付\(服务、完工\)时间' \
			
 
				-         '|交货时间|工期|质保期' \
			
 
				+         '|交货时间|工期' \
			
 
				          '|保洁期限|维保期|管理年限|工期承诺|(服务|合同|施工|实施|工程|设计)(年限|期限|周期|期：)' \
			
 
				          '|服务期限为|计划工期|工期要求|服务期限|服务期' \
			
 
				          '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
			
@@ -62,7 +62,7 @@ before2 = '(?P<before2>' \
 
				         # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
			
 
				 
			
 
				 before3 = '(?P<before3>' \
			
 
				-          '([\(（](日历天|施工时间)[\)）]|[\(（]天[\)）]|[\(（]年[\)）]|[\(（]月[\)）])?' \
			
 
				+          '，?([\(（](日历天|施工时间)[\)）]|[\(（]天[\)）]|[\(（]年[\)）]|[\(（]月[\)）])?' \
			
 
				           ')'
			
 
				 
			
 
				 before4 = '(?P<before4>' \
			
--- a/BiddingKG/dl_dev/test/test4.py
+++ b/BiddingKG/dl_dev/test/test4.py
@@ -22,6 +22,9 @@ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(le
 
				 import json
			
 
				 import random
			
 
				 
			
 
				+from ipywidgets import Layout
			
 
				+
			
 
				+
			
 
				 session = requests.Session()
			
 
				 
			
 
				 def test(name,content,_url=None):
			
@@ -106,15 +109,25 @@ def run_one():
 
				     # '''
			
 
				     print("start")
			
 
				     _time1 = time.time()
			
 
				-    print(predict("12", text,"市属公立医院医用耗材及其他设备招标结果每两周公示(10.16-10.31) "))
			
 
				+    print(predict("12", text,""))
			
 
				     # test(12,content)
			
 
				     # test(12,text)
			
 
				     print("takes",time.time()-a)
			
 
				     # a = time.time()
			
 
				-    # print(predict("12", text,""))
			
 
				+    # print(predict("12", text,"打印机"))
			
 
				     # print("takes", time.time() - a)
			
 
				     pass
			
 
				 
			
 
				+def test_ner():
			
 
				+    import fool
			
 
				+    _text = '''
			
 
				+    一、 *采购人名称：中共黄山市黄山区委统一战线工作部
			
 
				+
			
 
				+二、 *履约供应商名称：黄山区睿智办公设备销售中心
			
 
				+    '''
			
 
				+    print(fool.ner(_text))
			
 
				+
			
 
				 if __name__=="__main__":
			
 
				     # presure_test()
			
 
				-    run_one()
			
 
				+    # run_one()
			
 
				+    test_ner()
			
--- a/BiddingKG/readme/start.md
+++ b/BiddingKG/readme/start.md
@@ -9,6 +9,7 @@ cd /data/python
 
				 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
			
 
				 #启动接口
			
 
				 nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
			
 
				+nohup gunicorn --workers 3 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 192.168.2.102:15030 run_extract_server:app > extract.log 2>&1 &
			
 
				 #nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=14 &
			
 
				 
			
 
				 #19022启动要素提取接口
			
--- a/BiddingKG/run_extract_server.py
+++ b/BiddingKG/run_extract_server.py
@@ -81,7 +81,7 @@ def run_thread(data,list_result):
 
				     web_source_no = data.get("web_source_no","")
			
 
				     web_source_name = data.get("web_source_name","")
			
 
				     original_docchannel = data.get("original_docchannel","")
			
 
				-    print("web_source_name:",web_source_name)
			
 
				+    # print("web_source_name:",web_source_name)
			
 
				     is_fail = False
			
 
				     try:
			
 
				         if _content!="":
			
@@ -98,7 +98,7 @@ def run_thread(data,list_result):
 
				     # 以json形式返回结果
			
 
				     #_resp = json.dumps(data_res,cls=MyEncoder)
			
 
				     #log(str(data["flag"])+str(data))
			
 
				-    log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
			
 
				+    # log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
			
 
				     list_result.append(data_res)
			
 
				     if is_fail:
			
 
				         list_result.append(is_fail)
			
@@ -170,6 +170,7 @@ def start_with_tornado(port,process_num):
 
				     from tornado.httpserver import HTTPServer
			
 
				     from tornado.ioloop import IOLoop
			
 
				 
			
 
				+    print("import ")
			
 
				     http_server = HTTPServer(WSGIContainer(app))
			
 
				     # http_server.listen(port) #shortcut for bind and start
			
 
				     http_server.bind(port)