소스 검색

表格预处理修改;表格要素提取调整;包提取调整;角色金额规则调整;地区匹配调整

lsm 2 년 전
부모
커밋
a523758a25

+ 2 - 2
BiddingKG/dl/common/Utils.py

@@ -801,8 +801,8 @@ def uniform_package_name(package_name):
     name = ""
     if kw:
         name += kw.group(0)
-    if re.search('[a-zA-Z0-9-]{5,}$', package_name):   # 五个字符以上编号
-        _digit = re.search('[a-zA-Z0-9-]{5,}$', package_name).group(0).upper()
+    if re.search('^[a-zA-Z0-9-]{5,}$', package_name):   # 五个字符以上编号
+        _digit = re.search('^[a-zA-Z0-9-]{5,}$', package_name).group(0).upper()
         name += _digit
     elif re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name): # 处理类似 A包2标段
         ser = re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name)

+ 33 - 6
BiddingKG/dl/interface/Preprocessing.py

@@ -15,7 +15,7 @@ sys.path.append(os.path.abspath("../.."))
 sys.path.append(os.path.abspath(".."))
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.Entitys import *
-from BiddingKG.dl.interface.predictor import getPredictor
+from BiddingKG.dl.interface.predictor import getPredictor, TableTag2List
 from BiddingKG.dl.common.nerUtils import *
 from BiddingKG.dl.money.moneySource.ruleExtra import extract_moneySource
 from BiddingKG.dl.time.re_servicetime import extract_servicetime
@@ -128,6 +128,9 @@ def tableToText(soup):
         for item in inner_table:
             if len(item)>maxWidth:
                 maxWidth = len(item)
+        if maxWidth > 100:
+            # log('表格列数大于100,表格异常不做处理。')
+            return []
         for i in range(len(inner_table)):
             if len(inner_table[i])<maxWidth:
                 for j in range(maxWidth-len(inner_table[i])):
@@ -1046,10 +1049,35 @@ def tableToText(soup):
             if _td_len_list:
                 if len(list(set(_td_len_list))) >= 8 or max(_td_len_list) > 100:
                     return None
-        fixSpan(tbody)
-        inner_table = getTable(tbody)
+
+        # fixSpan(tbody)
+        # inner_table = getTable(tbody)
+        # inner_table = fixTable(inner_table)
+
+        table2list = TableTag2List()
+        inner_table = table2list.table2list(tbody, segment)
         inner_table = fixTable(inner_table)
+
+        if inner_table == []:
+            tbody.string = segment(tbody,final=False)
+            table_max_len = 30000
+            tbody.string = tbody.string[:table_max_len]
+            # log('异常表格直接取全文')
+            tbody.name = "turntable"
+            return None
+
         if len(inner_table)>0 and len(inner_table[0])>0:
+
+            for tr in inner_table:
+                for td in tr:
+                    if isinstance(td, str):
+                        tbody.string = segment(tbody,final=False)
+                        table_max_len = 30000
+                        tbody.string = tbody.string[:table_max_len]
+                        # log('异常表格,不做表格处理,直接取全文')
+                        tbody.name = "turntable"
+                        return None
+
             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
             #inner_table,head_list = setHead_inline(inner_table)
             # inner_table, head_list = setHead_initem(inner_table,pat_head)
@@ -1069,7 +1097,6 @@ def tableToText(soup):
             # for item in inner_table:
             #     print(item)
 
-
             tbody.string = getTableText(inner_table,head_list)
             table_max_len = 30000
             tbody.string = tbody.string[:table_max_len]
@@ -1311,12 +1338,12 @@ def segment(soup,final=True):
     # 感叹号替换为中文句号
     text = re.sub("(?<=[\u4e00-\u9fa5])[!!]|[!!](?=[\u4e00-\u9fa5])","。",text)
     #替换格式未识别的问号为" " ,update:2021/7/20
-    text = re.sub("[?\?]{2,}"," ",text)
+    text = re.sub("[?\?]{2,}|\n"," ",text)
 
 
     #替换"""为"“",否则导入deepdive出错
     # text = text.replace('"',"“").replace("\r","").replace("\n",",")
-    text = text.replace('"',"“").replace("\r","").replace("\n","")  #2022/1/4修复 非分段\n 替换为逗号造成 公司拆分 span \n南航\n上海\n分公司
+    text = text.replace('"',"“").replace("\r","").replace("\n","").replace("\\n","") #2022/1/4修复 非分段\n 替换为逗号造成 公司拆分 span \n南航\n上海\n分公司
     # print('==1',text)
     # text = re.sub("\s{4,}",",",text)
     # 解决公告中的" "空格替换问题

BIN
BiddingKG/dl/interface/district_dic.pkl


+ 2 - 1
BiddingKG/dl/interface/extract.py

@@ -254,7 +254,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic)
+    version_date = {'version_date': '2022-11-24'}
+    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment

+ 6 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -870,12 +870,15 @@ def getPackagesFromArticle(list_sentence, list_entity):
             tokens = list_sentence[i].tokens
             _names = []
             for iter in re.finditer(package_number_pattern, content):
-                if re.match('\d', iter.group(0)) and iter.end() < len(content) and content[
-                    iter.end()].isdigit():  # 排除2.10标段3 这种情况
+                # print('提取到标段:%s, 前后文:%s'%(iter.group(), content[iter.start()-5:iter.end()+5]))
+                if re.match('\d', iter.group(0)) and re.search('\d.$', content[:iter.start()]):  # 排除2.10标段3  5.4标段划分 这种情况
+                    # print('过滤掉错误包:', iter.group())
                     continue
                 if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
+                    # print('过滤掉错误包:', iter.group())
                     continue
-                elif iter.end()+1 < len(content) and  re.search('标准|包装', content[iter.start():iter.end()+1]):
+                elif iter.end()+2 < len(content) and  re.search('标准|标的物|标志|包装|划分', content[iter.start():iter.end()+2]):
+                    # print('过滤掉错误包:',iter.group())
                     continue
                 temp_package_number = uniform_package_name(iter.group(0))
                 True_package.add(temp_package_number)

BIN
BiddingKG/dl/interface/header_set.pkl


+ 76 - 22
BiddingKG/dl/interface/predictor.py

@@ -681,7 +681,9 @@ class PREMPredict():
             label = np.argmax(predict_y[i])
             values = predict_y[i]
             text = text_list[i]
-            if label == 2:
+            if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
+                label = 5
+            elif label == 2:
                 if re.search('中标单位和.{,25}签订合同', text):
                     label = 0
                     values[label] = 0.501
@@ -738,12 +740,13 @@ class PREMPredict():
             label = np.argmax(predict_y[i])
             values = predict_y[i]
             text = text_list[i]
-            if label == 1 and re.search('[::,。](总金额|总价|单价)', text):
+            if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
+                label = 2
+            elif label == 1 and re.search('[::,。](总金额|总价|单价)', text):
                 values[label] = 0.49
             elif label ==0 and entity.notes in ["投资", "工程造价"]:
                 values[label] = 0.49
             elif label == 0 and re.search('最低限价', text):
-                label = 2
                 values[label] = 0.49
             entity.set_Money(label, values)
 
@@ -1202,7 +1205,7 @@ class RoleRulePredictor():
 
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
-        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额")
+        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价")
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
         self.pattern_money_other = re.compile("代理费|服务费")
@@ -1400,6 +1403,10 @@ class RoleRulePredictor():
                                                                                        _span[0]) is None:
                                     p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                     p_entity.label = 1
+                                elif re.search('(预算金额|最高(投标)?上?限[价额]?格?|招标控制价))?:?([\d.,]+万?元[,(]其中)?(第?[一二三四五0-9](标[段|包]|[分子]包):?[\d.,]+万?元,)*第?[一二三四五0-9](标[段|包]|[分子]包):?$'
+                                        , _sentence.sentence_text[:p_entity.wordOffset_begin]): # 处理几个标段金额相邻情况 例子:191705231
+                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
+                                    p_entity.label = 0
 
             # 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
             list_p = []
@@ -2976,7 +2983,8 @@ class DocChannel():
           '公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
           '公告变更neg': '履约变更内容',
           '候选人公示': '候选人公示|评标结果公示|中标候选人名单公示',
-          '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果)\w{,4}(进行公示|公[示布]如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
+          '候选人公示neg': '中标候选人公示期',
+          '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
           '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示',
           '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
           '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家',
@@ -4248,7 +4256,7 @@ class DistrictPredictor():
             except Exception as e:
                 print('解析prem 获取招标人、及地址出错')
             return tenderee, tenderee_address
-        def get_area(text, web_source_name):
+        def get_area(text, web_source_name, not_in_content=True):
             score_l = []
             id_set = set()
 
@@ -4297,14 +4305,17 @@ class DistrictPredictor():
                         w = self.dist_dic[_id]['权重']
                         score = w * 0.2
                         score_l.append([_id, score] + area)
-            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知'}
+            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
             if len(score_l) == 0:
                 return {'district': area_dic}
             else:
                 df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
+                df['简称'] = df['id'].apply(lambda x: self.dist_dic[x]['地区'])
+                # print('地区评分:')
+                # print(df)
                 df_pro = df.groupby('province').sum().sort_values(by=['score'], ascending=False)
                 pro_id = df_pro.index[0]
-                if df_pro.loc[pro_id, 'score'] < 0.1:  # 省级评分小于0.1的不要
+                if df_pro.loc[pro_id, 'score'] < 0.1 and not_in_content:  # 不是二次全文匹配的 省级评分小于0.1的不要
                     # print('评分低于0.1', df_pro.loc[pro_id, 'score'], self.dist_dic[pro_id]['地区'])
                     return {'district': area_dic}
                 area_dic['province'] = self.dist_dic[pro_id]['地区']
@@ -4325,15 +4336,32 @@ class DistrictPredictor():
                 return {'district': area_dic}
 
         tenderee, tenderee_address = get_ree_addr(prem)
-        project_name = str(project_name).replace(str(tenderee), '')
-        text1 = "{} {} {}".format(project_name, tenderee, tenderee_address)
+        project_name = str(project_name)
+        tenderee = str(tenderee)
+
+        if '##attachment##' in list_articles[0].content:
+            content, attachment = list_articles[0].content.split('##attachment##')
+            if len(content) < 200:
+                content += attachment
+        else:
+            content = list_articles[0].content
+
+        project_name = project_name + title if project_name not in title else project_name
+        project_name = project_name.replace(tenderee, '')
+        text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
+        ser = re.search('项目所在地区?:(\w{2,8}[省市区县])+', content)
+        if ser:
+            text1 = ser.group(0)
+
         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
         rs = get_area(text1, web_source_name)
+
         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
-            text2 = title + list_articles[0].content if len(list_articles[0].content)<2000 else title + list_articles[0].content[:1000] + list_articles[0].content[-1000:]
+            text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
             text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
-            rs2 = get_area(text2, web_source_name)
+            rs2 = get_area(text2, web_source_name, not_in_content=False)
+            rs2['district']['is_in_text'] = True
             if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
                 rs = rs2
             elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
@@ -4342,7 +4370,7 @@ class DistrictPredictor():
 
 class TableTag2List():
     '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
-    def table2list(self, table):
+    def table2list(self, table, text_process=None):
         self._output = []
         row_ind = 0
         col_ind = 0
@@ -4351,17 +4379,21 @@ class TableTag2List():
             # we should skip
             smallest_row_span = 1
 
+            if len(row.find_all(['td', 'th'], recursive=False)) > 20:
+                log('未补全前表格列数大于20的不做表格处理')
+                return []
+
             for cell in row.children:
                 if cell.name in ('td', 'th'):
                     # check multiple rows
                     # pdb.set_trace()
-                    row_span = int(re.sub('[^0-9]', '', cell.get('rowspan'))) if cell.get('rowspan') and re.search('[0-9]', cell.get('rowspan')) else 1
+                    row_span = int(re.sub('[^0-9]', '', cell.get('rowspan'))) if cell.get('rowspan') and cell.get('rowspan').isdigit() else 1
 
                     # try updating smallest_row_span
                     smallest_row_span = min(smallest_row_span, row_span)
 
                     # check multiple columns
-                    col_span = int(re.sub('[^0-9]', '', cell.get('colspan'))) if cell.get('colspan') and re.search('[0-9]', cell.get('colspan')) else 1
+                    col_span = int(re.sub('[^0-9]', '', cell.get('colspan'))) if cell.get('colspan') and cell.get('colspan').isdigit() else 1
 
                     # find the right index
                     while True:
@@ -4371,15 +4403,19 @@ class TableTag2List():
 
                     # insert into self._output
                     try:
-                        text = str(cell.get_text()).replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
-                        text = re.sub('\s', '', text)[:200] # 只需取前200字即可
+                        if text_process != None:
+                            text = [re.sub('\xa0','',text_process(cell,final=False)),0]
+                        else:
+                            text = str(cell.get_text()).replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
+                            text = re.sub('\s', '', text)[:200] # 只需取前200字即可
+                            text = ' ' if text == "" else text
                         self._insert(row_ind, col_ind, row_span, col_span, text)
                     except UnicodeEncodeError:
                         raise Exception( 'Failed to decode text; you might want to specify kwargs transformer=unicode' )
 
                     # update col_ind
                     col_ind += col_span
-                    if col_ind > 50: # 表格列数大于50的去掉
+                    if col_ind > 50 and text_process == None: # 表格要素提取及候选人提取的 表格列数大于50的去掉
                         return []
 
             # update row_ind
@@ -4401,7 +4437,7 @@ class TableTag2List():
             return True
         if j >= len(self._output[i]):
             return True
-        if self._output[i][j] is None:
+        if self._output[i][j] == "":
             return True
         return False
 
@@ -4427,7 +4463,7 @@ class TablePremExtractor(object):
         self.head_rule_dic = {
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
-            "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程)(名称?|内容)",
+            "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品)(名称?|内容)",
             "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
@@ -4444,7 +4480,7 @@ class TablePremExtractor(object):
     def find_header(self, td_list):
         header_dic = dict()
         flag = False
-        if len(set(td_list))>2 and len(set(td_list) & self.headerset)/len(set(td_list))>0.6:
+        if len(set(td_list))>2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
             flag = True
             for i in range(len(td_list)) :
                 text = td_list[i]
@@ -4462,6 +4498,19 @@ class TablePremExtractor(object):
                 if num>1:
                     print('表头错误,一个td匹配到两个表头:', header_dic)
                     return flag, dict()
+            if re.search(';金额(万?元);', ';'.join(td_list)):  # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
+                if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
+                    for i in range(len(td_list)):
+                        text = td_list[i]
+                        if  re.search('^金额(万?元)$',text):
+                            header_dic['bid_amount'] = (i, text)
+                            break
+                elif 'tenderee' in header_dic and 'budget' not in header_dic:
+                    for i in range(len(td_list)):
+                        text = td_list[i]
+                        if re.search('^金额(万?元)$', text):
+                            header_dic['budget'] = (i, text)
+                            break
             if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
                     'budget' in header_dic or 'tenderer' in header_dic):
                 return flag, header_dic
@@ -4610,6 +4659,7 @@ class TablePremExtractor(object):
     def get_prem(self, soup):
         tables = soup.find_all('table')
         tables.reverse()
+
         rs_dic = {}
         for table in tables:
             trs = self.tb.table2list(table)
@@ -4631,7 +4681,7 @@ class TablePremExtractor(object):
                         else:
                             print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                             break
-                    if len(table_items) > 1:
+                    if len(table_items) > 0:
                         df = pd.DataFrame(table_items)
                         prem_ = self.extract_from_df(df, headers)
                         rs_dic.update(prem_)
@@ -4647,6 +4697,10 @@ class TablePremExtractor(object):
         prem = self.get_prem(soup)
         if prem == {} and richText:
             prem = self.get_prem(richText)
+        if len(prem) == 1:  # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
+            k = list(prem)[0]
+            if k == '1' or len(k) > 2:
+                prem['Project'] = prem.pop(k)
         return prem
 
 class CandidateExtractor(object):