Bläddra i källkod

修复某些中标表达及字符转换造成的错误

lsm 1 år sedan
förälder
incheckning
7b574bc244

+ 2 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -2208,6 +2208,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})):?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题
         article_processed = re.sub('(不?含(可抵扣增值|\w{,8})税)', '', article_processed)    # 120637247 投标报价(元),(含可抵扣增值税):277,560.00。
         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式:名称)', '供应商名称', article_processed)  # 18889217, 84422177
+        article_processed = re.sub(',最高有效报价者:', ',中标人名称:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
+        article_processed = re.sub(',最高有效报价:', ',投标报价:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
         if ser:
             article_processed = article_processed.replace(ser.group(0), '采购人名称:%s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))

+ 2 - 2
BiddingKG/dl/interface/extract.py

@@ -110,7 +110,7 @@ def extractCount(extract_dict):
 # 字符编码标准化
 def str_normalize(text):
     # time1 = time.time()
-    cn_punctuation = "¥,。:;{}!?()"
+    cn_punctuation = "¥,。:;{}!?()"
     text_split = re.split("([{}])+".format(cn_punctuation),text)
     # print(text_split)
     new_text = ""
@@ -327,7 +327,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-07-03'}
+    version_date = {'version_date': '2023-07-04'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''

+ 10 - 6
BiddingKG/dl/interface/predictor.py

@@ -1311,7 +1311,7 @@ class RoleRulePredictor():
         self.pattern_winTenderer_left = "(?P<winTenderer_left>" \
                "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承租((包))?)(候选)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致)[::是为]+$" \
-               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$" \
+               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?" \
                "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$" \
                "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$)"  # 承办单位:不作为中标 83914772
         self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w0>" \
@@ -2241,7 +2241,7 @@ class MoneyGrade():
                     if ser:
                         groupdict = pattern.split('>')[0].replace('(?P<', '')
                         _role, _direct, _prob = groupdict.split('_')
-                        if re.search('单价', context[-4:]) or float(entity.entity_text):
+                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or float(entity.entity_text)<100:
                             _prob = 6
                         _label = role2id.get(_role)
                         if _label != entity.label:
@@ -2249,16 +2249,16 @@ class MoneyGrade():
                         _prob = int(_prob) * 0.1
                         # print('规则修改金额概率前:', entity.entity_text, entity.label, entity.values)
                         if in_att:
-                            _prob = _prob - 0.2
+                            _prob = max(0.5, _prob - 0.2)
                         entity.values[_label] = _prob + entity.values[_label] / 20
                         not_found = 0
                         # print('规则修改金额概率后:', entity.entity_text, entity.label, entity.values)
                         break
                 if not_found and entity.values[entity.label] > min_prob:
-                    if re.search('单价', context[-4:]) or float(entity.entity_text)<100:
+                    if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or float(entity.entity_text)<100:
                         _prob = 0.6
                     elif in_att:
-                        _prob = min_prob - 0.1
+                        _prob = max(0.5, min_prob - 0.1)
                     else:
                         _prob = min_prob
                     # _prob = min_prob - 0.1 if in_att else min_prob
@@ -4963,7 +4963,11 @@ class TableTag2List():
                     # insert into self._output
                     try:
                         if text_process != None:
-                            text = [re.sub('\xa0','',text_process(cell,final=False)),0]
+                            # text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
+                            td_text = re.sub('\xa0', '', text_process(cell, final=False))
+                            if td_text == "":
+                                td_text = ' '
+                            text = [td_text,0]
                         else:
                             text = str(cell.get_text()).replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
                             text = re.sub('\s', '', text)[:200] # 只需取前200字即可