Browse Source

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

 Conflicts:
	BiddingKG/dl/interface/predictor.py
	BiddingKG/dl_dev/test/test4.py
znj 1 năm trước
mục cha
commit
8ca12cdad3

+ 20 - 0
BiddingKG/dl/common/Utils.py

@@ -572,6 +572,26 @@ def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag
     #print(result)
     return result
 
+def get_context(sentence_text, begin_index, end_index, size=20, center_include=False):
+    '''
+    Return the text surrounding an entity inside a sentence.
+    :param sentence_text: sentence text
+    :param begin_index: character offset where the entity starts
+    :param end_index: character offset where the entity ends
+    :param size: number of characters of context to take on each side
+    :param center_include: when True, the slice sentence_text[begin_index:end_index] is also returned
+    :return: list [left, right], or [left, center, right] when center_include is True
+    '''
+    result = []
+    # Clamp the left edge to 0 so a negative start never wraps around to the end of the text.
+    begin = begin_index - size if begin_index>size else 0
+    # No clamp is needed on the right: a slice that runs past the end is simply truncated.
+    end = end_index + size
+    result.append(sentence_text[begin: begin_index])
+    if center_include:
+        result.append(sentence_text[begin_index: end_index])
+    result.append(sentence_text[end_index: end])
+    return result
+
+
 #根据规则补全编号或名称两边的符号
 def fitDataByRule(data):
     symbol_dict = {"(":")",

+ 3 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -2210,11 +2210,12 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式:名称)', '供应商名称', article_processed)  # 18889217, 84422177
         article_processed = re.sub(',最高有效报价者:', ',中标人名称:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         article_processed = re.sub(',最高有效报价:', ',投标报价:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
-        ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
+        article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
+        ser = re.search('(采购|招标|比选)人(名称)?/(采购|招标|比选)?代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
         if ser:
             article_processed = article_processed.replace(ser.group(0), '采购人名称:%s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))
 
-        ser2 = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25})[,。]', article_processed)
+        ser2 = re.search('(采购|招标)人(名称)?/(采购|招标)?代理机构(名称)?:(?P<tenderee>[\w()]{4,25})[,。/]', article_processed)
         if ser2:
             article_processed = article_processed.replace(ser2.group(0), '采购人名称:%s,采购代理机构名称:,' % (
             ser2.group('tenderee')))

+ 26 - 1
BiddingKG/dl/interface/extract.py

@@ -62,6 +62,7 @@ def extractCount(extract_dict):
     bidding_budget = ""
     win_tenderer = ""
     win_bid_price = ""
+    linklist_count = 0
     for _key in dict_pack.keys():
         if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
             extract_count += 1
@@ -100,6 +101,13 @@ def extractCount(extract_dict):
                                 win_bid_price = str(float(_role["role_money"]["money"]))
                 if _role["role_name"]=="agency":
                     agency = _role["role_text"]
+                linklist = _role.get("linklist",[])
+                for link in linklist:
+                    for l in link:
+                        if l!="":
+                            linklist_count += 1
+
+    extract_count += linklist_count//2
 
     if project_code!="":
         extract_count += 1
@@ -198,6 +206,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("get prem done of doc_id%s"%(doc_id))
     cost_time["prem"] = round(time.time()-start_time,2)
 
+    # roles_l = get_role_context(doc_id, list_sentences, list_entitys)
+    # return roles_l
+
     # start_time = time.time() # 产品名称及废标原因提取  此处作废 换到后面预测 2022/4/29
     # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
     # fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
@@ -329,7 +340,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-07-04'}
+    version_date = {'version_date': '2023-09-13'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''
@@ -382,6 +393,20 @@ def get_ent_context(list_sentences, list_entitys):
                 rs_list.append("%s %d %.4f; %s ## %s ## %s"%(_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
     return '\n'.join(rs_list)
 
+def get_role_context(docid, list_sentences, list_entitys):
+    '''
+    Collect the left/right text context of every 'org' and 'company' entity.
+    :param docid: document id, copied unchanged into every output tuple
+    :param list_sentences: only list_sentences[0] is read; its items must expose sentence_index and sentence_text
+    :param list_entitys: iterable of entity lists; every entity in every list is inspected
+    :return: list of tuples (docid, entity_type, label, value of the label formatted with 4 decimals,
+             left context, entity_text, right context)
+    '''
+    rs_list = []
+    sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
+    for list_entity in list_entitys:
+        for _entity in list_entity:
+            if _entity.entity_type in ['org', 'company']:
+                # NOTE(review): the sorted list is indexed by position, so this assumes sentence_index
+                # values are contiguous and start at 0 -- confirm against the preprocessing step.
+                sentence = sentences[_entity.sentence_index]
+                # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index, size=20,
+                #                    center_include=False, word_flag=True, text=_entity.entity_text)
+                # center_include=False, so _span is [left, right]: 20 characters on each side of the entity.
+                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=20, center_include=False)
+                rs_list.append((docid, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
+                _entity.entity_text, _span[1]))
+    return rs_list
+
 if __name__=="__main__":
     import pandas as pd
     t1 = time.time()

+ 5 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -3460,15 +3460,17 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
             content += attachment
     else:
         content = list_articles[0].content
-    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的
+    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同|投标))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的
         if total_product_money>0 and total_product_money<5000000000:
             for value in prem[0]['prem'].values():
+                ree_money = float(value['tendereeMoney'])
                 for l in value['roleList']:
                     try:
                         # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
                         #     l[2] = total_product_money
                         #     log('修改中标金额为所有产品总金额')
-                        if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money/10:
+                        # if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) == 0 and float(l["role_money"]['money'])<total_product_money/10:
+                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or float(l["role_money"]['money'])<ree_money/2): # 改为小于一半招标金额或为0时替换为合计金额
                             l["role_money"]['money'] = total_product_money
                             # print('修改中标金额为所有产品总金额')
                     except Exception as e:
@@ -3522,7 +3524,7 @@ def limit_maximum_amount(dic, list_entity):
         else:
             maximum_amount = 50000000000
             minximum_amount = 500
-    elif re.search('(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text) or category in ['零售批发']:
+    elif re.search('(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text):
         # print('商品采购限额')
         maximum_amount = 80000000
         minximum_amount = 10

BIN
BiddingKG/dl/interface/header_set.pkl


+ 16 - 0
BiddingKG/dl/interface/modelFactory.py

@@ -80,6 +80,22 @@ class Model_role_classify_word():
         _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False) #  word_len=20
         # print(_encode_span)
         return _encode_span
+
+    def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):
+        '''
+        Encode the context around an entity, using character offsets.
+        :param sentence_text: sentence text
+        :param begin_index: character offset where the entity starts
+        :param end_index: character offset where the entity ends
+        :param size: number of characters of context taken on each side
+        :param kwargs: accepted but not used by this method
+        :return: the value returned by encodeInput for the [left, right] context pair
+        '''
+        # center_include=False, so _span holds only the left and right context, not the entity itself.
+        _span = get_context(sentence_text, begin_index, end_index,size=size, center_include=False)  # size=12 center_include=True
+        # print(_span)
+        # NOTE(review): word_len is fixed at 20 here regardless of `size` -- confirm this is intended.
+        _encode_span = encodeInput(_span, word_len=20, word_flag=True, userFool=False)  # word_len=20
+        # print(_encode_span)
+        return _encode_span
     
     def predict(self,x):
         x = np.transpose(np.array(x),(1,0,2))

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 295 - 165
BiddingKG/dl/interface/predictor.py


+ 10 - 9
BiddingKG/dl/metrics/extractMetric.py

@@ -257,7 +257,8 @@ class ExtractMetric():
         print(metrics)
 
     def extractFromInterface(self,content):
-        return json.loads(test("",content))
+        _json = test("",content)
+        return json.loads(_json)
 
     def getDiff(self,_inter,_inter2):
         _dict = {}
@@ -310,18 +311,18 @@ class ExtractMetric():
             if float(v.get("tendereeMoney",0))>0:
                 dict_project["%s_inter2"%("tendereeMoney")] = [float(v.get("tendereeMoney"))]
             for _role in v.get("roleList",[]):
-                dict_project["%s_inter2"%_role[0]] = [_role[1]]
-                if _role[0] in ["win_tenderer","second_tenderer","third_tenderer"]:
-                    if float(_role[2])>0:
-                        dict_project["%s_money_inter2"%_role[0]] = [float(_role[2])]
-                for item in _role[3]:
+                dict_project["%s_inter2"%_role.get("role_type")] = [_role.get("role_text")]
+                if _role.get("role_type") in ["win_tenderer","second_tenderer","third_tenderer"]:
+                    if float(_role.get("role_money").get("money",0))>0:
+                        dict_project["%s_money_inter2"%_role.get("role_type")] = [float(_role.get("role_money").get("money",0))]
+                for item in _role.get("linklist"):
                     _person = item[0]
                     _phone = item[1]
                     if _person=="" or _phone=="":
                         continue
-                    if "%s_person_inter2"%_role[0] not in dict_project:
-                        dict_project["%s_person_inter2"%_role[0]] = []
-                    dict_project["%s_person_inter2"%_role[0]].append("%s-%s"%(_role[1],_person))
+                    if "%s_person_inter2"%_role.get("role_type") not in dict_project:
+                        dict_project["%s_person_inter2"%_role.get("role_type")] = []
+                    dict_project["%s_person_inter2"%_role.get("role_type")].append("%s-%s"%(_role.get("role_text"),_person))
                     if "person_phone_inter2" not in dict_project:
                         dict_project["person_phone_inter2"] = []
                     dict_project["person_phone_inter2"].append("%s-%s"%(_person,_phone))

+ 3 - 0
BiddingKG/dl/test/compare1.txt

@@ -0,0 +1,3 @@
+
+
+import json

+ 3 - 3
BiddingKG/dl/time/re_servicetime.py

@@ -20,9 +20,9 @@ TEST_MODE = False
 
 before = '(?P<before>' \
          '合同期限|工期/交货期/服务期|工期,|工期\(交货期\)|合格工期|服务期限|工期' \
-         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
+         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
          '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
-         '|交货时间|工期|质保期' \
+         '|交货时间|工期' \
          '|保洁期限|维保期|管理年限|工期承诺|(服务|合同|施工|实施|工程|设计)(年限|期限|周期|期:)' \
          '|服务期限为|计划工期|工期要求|服务期限|服务期' \
          '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
@@ -62,7 +62,7 @@ before2 = '(?P<before2>' \
         # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
 
 before3 = '(?P<before3>' \
-          '([\((](日历天|施工时间)[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
+          ',?([\((](日历天|施工时间)[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
           ')'
 
 before4 = '(?P<before4>' \

+ 16 - 3
BiddingKG/dl_dev/test/test4.py

@@ -22,6 +22,9 @@ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(le
 import json
 import random
 
+from ipywidgets import Layout
+
+
 session = requests.Session()
 
 def test(name,content,_url=None):
@@ -106,15 +109,25 @@ def run_one():
     # '''
     print("start")
     _time1 = time.time()
-    print(predict("12", text,"市属公立医院医用耗材及其他设备招标结果每两周公示(10.16-10.31) "))
+    print(predict("12", text,""))
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-a)
     # a = time.time()
-    # print(predict("12", text,""))
+    # print(predict("12", text,"打印机"))
     # print("takes", time.time() - a)
     pass
 
+def test_ner():
+    '''Run fool.ner on a small hard-coded sample text and print the result.'''
+    # Imported locally, so the rest of the module does not import fool at load time.
+    import fool
+    _text = '''
+    一、 *采购人名称:中共黄山市黄山区委统一战线工作部

+二、 *履约供应商名称:黄山区睿智办公设备销售中心
+    '''
+    print(fool.ner(_text))
+
 if __name__=="__main__":
     # presure_test()
-    run_one()
+    # run_one()
+    test_ner()

+ 1 - 0
BiddingKG/readme/start.md

@@ -9,6 +9,7 @@ cd /data/python
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #启动接口
 nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup gunicorn --workers 3 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 192.168.2.102:15030 run_extract_server:app > extract.log 2>&1 &
 #nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=14 &
 
 #19022启动要素提取接口

+ 3 - 2
BiddingKG/run_extract_server.py

@@ -81,7 +81,7 @@ def run_thread(data,list_result):
     web_source_no = data.get("web_source_no","")
     web_source_name = data.get("web_source_name","")
     original_docchannel = data.get("original_docchannel","")
-    print("web_source_name:",web_source_name)
+    # print("web_source_name:",web_source_name)
     is_fail = False
     try:
         if _content!="":
@@ -98,7 +98,7 @@ def run_thread(data,list_result):
     # 以json形式返回结果
     #_resp = json.dumps(data_res,cls=MyEncoder)
     #log(str(data["flag"])+str(data))
-    log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
+    # log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
     list_result.append(data_res)
     if is_fail:
         list_result.append(is_fail)
@@ -170,6 +170,7 @@ def start_with_tornado(port,process_num):
     from tornado.httpserver import HTTPServer
     from tornado.ioloop import IOLoop
 
+    print("import ")
     http_server = HTTPServer(WSGIContainer(app))
     # http_server.listen(port) #shortcut for bind and start
     http_server.bind(port)

Một số tệp đã không được hiển thị vì có quá nhiều tệp thay đổi trong diff này