Prechádzať zdrojové kódy

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	BiddingKG/dl/interface/extract.py
luojiehua 6 mesiacov pred
rodič
commit
375d7f8698

+ 5 - 0
.gitignore

@@ -9,6 +9,7 @@
 /BiddingKG/dl/product/data/
 /BiddingKG/dl/channel/data/
 /BiddingKG/dl_dev/test
+/BiddingKG/dl_dev/test2
 /BiddingKG/dl/test
 node_modules
 /BiddingKG/dl/table_head/train_data/
@@ -16,3 +17,7 @@ node_modules
 /BiddingKG/dl/table_head/checkpoints/
 /BiddingKG/dl/table_head/data_new.csv
 /BiddingKG/dl/table_head/has_table_no_attach.xlsx
+/BiddingKG/dl/LEGAL_ENTERPRISE.txt
+/BiddingKG/dl_dev/
+BiddingKG.iml
+misc.xml

+ 25 - 1
BiddingKG/dl/common/Utils.py

@@ -1009,7 +1009,7 @@ def find_package(content):
             '[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
             # print('过滤掉错误包:', iter.group())
             continue
-        elif iter.end() + 2 < len(content) and re.search('标准|标的物|标志|包装|划分|标书',
+        elif iter.end() + 2 < len(content) and re.search('标的物|包装|划分|标(|准|志|记|识|签|贴|帜|本|底|价|量)',
                                                          content[iter.start():iter.end() + 2]):
             # print('过滤掉错误包:', iter.group())
             continue
@@ -1122,6 +1122,30 @@ def del_tabel_achievement(soup):
             del_tag = tr.extract()
             # print('删除表格业绩内容', del_tag.text)
 
+def is_all_winner(title):
+    '''
+    Classify a title to decide whether all bidders should be extracted as winning bidders.
+    Original author's note: for deposit/custody-type projects every bidder is treated as a
+    winner regardless of ranking; for shortlist-type projects the ranking is followed, and
+    when there is no ranking every bidder is treated as a winner.
+    :param title: the title text; it is passed to re.search, so it must be a str
+    :return: 1 if the title matches the deposit-type keywords, 2 if it matches the
+             recruitment/shortlist/framework-purchase/supplier-pool keywords, otherwise False
+    '''
+    # The deposit-type pattern is tested first, so a title matching both patterns yields 1.
+    if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理', title):
+        return 1
+    elif re.search('招募|入围|框架采购|(单位|商|机构)入库|入库供应商', title):
+        return 2
+    return False
+
+def is_deposit_project(title, name, requirement):
+    '''
+    Use a regular expression to decide whether the project is a bank-deposit type project.
+    :param title: the title
+    :param name: the project name
+    :param requirement: the procurement content
+    :return: True if the keyword pattern matches anywhere in title+name+requirement, otherwise False
+    '''
+    # The three fields are concatenated without a separator, so all of them must be str
+    # (a None would raise TypeError) and a match may span the boundary between two fields.
+    if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)((.{2,10}))?存放|存放银行|存款(服务|业务|项目)|国库现金管理|存款账户开户|(管理|存款|合作)(定点|专户)?银行|贷款合作银行|资金监管账户|开户银行项目|专户开户银行|银行专户选择|定期存[款放]|专项债券?专用账户', title+name+requirement):
+        return True
+    return False
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 3 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -179,7 +179,7 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
                     if have_bus:
                         lb, prob = get_role(dic)
                         bus_dic[_entity.entity_text] = (lb, prob)
-                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|银行|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
+                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
                             bus_tenderee.append(_entity)
                     elif re.search('^\w{2,6}银行\w{2,10}[分支]行$', _entity.entity_text): # 2024/05/22 补充某些支行没收集到工商数据
                         have_bus = True
@@ -236,6 +236,8 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
                         _ent = long_entity[second_i]
                         if _ent.label in [0,1,5]:
                             if len(_entity.entity_text)<len(_ent.entity_text) and is_short(_entity.entity_text, _ent.entity_text):  # 简称顺序包含在工商名称内的替换
+                                if _entity.entity_text.endswith('大学'): # 修复 533357339 东北大学 替换为 中国银行沈阳东北大学支行
+                                    continue
                                 _entity.entity_text = _ent.entity_text
                                 lb, prob = bus_dic[_entity.entity_text]
                                 if lb in [0, 1] and prob > 0.9 and _entity.values[

+ 1 - 1
BiddingKG/dl/interface/Entitys.py

@@ -342,7 +342,7 @@ class Role():
         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
-        if result['role_name'] == 'tenderee':
+        if result['role_name'] in ['tenderee', 'win_tenderer']:
             result['role_prob'] = self.role_prob
         if result['role_name'] == 'win_tenderer' and self.multi_winner != set():
             self.multi_winner.add(result['role_text'])

+ 75 - 26
BiddingKG/dl/interface/Preprocessing.py

@@ -8,7 +8,7 @@ import time
 import codecs
 
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
-from BiddingKG.dl.table_head.predict import predict
+from BiddingKG.dl.table_head.predict_torch import predict
 
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
@@ -708,6 +708,8 @@ def tableToText(soup, docid=None):
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
+                if origin_inner_table[i][j][0] in ['主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺', '公众反馈意见的联系方式'] and predict_list[i][j]!=1:
+                    inner_table[i][j] = [origin_inner_table[i][j][0], 1]
 
         if show:
             print(inner_table)
@@ -1402,10 +1404,25 @@ def tableToText(soup, docid=None):
                 in_attachment = True
     #逆序处理嵌套表格
     # print('len(tbodies)1', len(tbodies))
-    for tbody_index in range(1,len(tbodies)+1):
+    # for tbody_index in range(1,len(tbodies)+1):
+    tbody_index = 1
+    while tbody_index < len(tbodies)+1:
         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
+        if len(tbody.find_all('tr')) == 1 and tbody_index > 0 and len(
+                tbodies[tbody_index - 1][0].find_all('tr')) == 1 and len(tbody.find_all(['th', 'td'])) == len(
+                tbodies[tbody_index - 1][0].find_all(['th', 'td'])): # 处理相邻表格都只有一行的情况;例:526321576
+            for row in tbody.find_all('tr'):
+                if tbodies[tbody_index - 1][0].tbody:
+                    tbodies[tbody_index - 1][0].tbody.append(row)
+                else:
+                    tbodies[tbody_index - 1][0].append(row)
+            inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
+            list_innerTable.append(inner_table)
+            tbody_index += 2
+            continue
         inner_table = trunTable(tbody,_in_attachment)
         list_innerTable.append(inner_table)
+        tbody_index += 1
 
     # tbodies = soup.find_all('tbody')
     # 遍历表格中的每个tbody
@@ -1418,11 +1435,25 @@ def tableToText(soup, docid=None):
             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
                 in_attachment = True
     #逆序处理嵌套表格
-    # print('len(tbodies)2', len(tbodies))
-    for tbody_index in range(1,len(tbodies)+1):
+    tbody_index = 1
+    # for tbody_index in range(1,len(tbodies)+1):
+    while tbody_index < len(tbodies) + 1:
         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
+        if len(tbody.find_all('tr')) == 1 and tbody_index > 0 and len(
+                tbodies[tbody_index - 1][0].find_all('tr')) == 1 and len(tbody.find_all(['th', 'td'])) == len(
+                tbodies[tbody_index - 1][0].find_all(['th', 'td'])): # 处理相邻表格都只有一行的情况;例:526321576
+            for row in tbody.find_all('tr'):
+                if tbodies[tbody_index - 1][0].tbody:
+                    tbodies[tbody_index - 1][0].tbody.append(row)
+                else:
+                    tbodies[tbody_index - 1][0].append(row)
+            inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
+            list_innerTable.append(inner_table)
+            tbody_index += 2
+            continue
         inner_table = trunTable(tbody,_in_attachment)
         list_innerTable.append(inner_table)
+        tbody_index += 1
 
     return soup
     # return list_innerTable
@@ -2085,8 +2116,8 @@ def segment(soup,final=True):
             child.insert_after("。")
         if child.name in commaList:
             child.insert_after(",")
-            if child.name != "td" and re.match('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割
-                child.insert_before("。")
+            # if child.name != "td" and re.match('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割 20240930 注销,修复529501491关键词 三、中标供应商:(一)单位名称被分句
+            #     child.insert_before("。")
         # if child.name == 'div' and 'class' in child.attrs:
         #     # 添加附件"attachment"标识
         #     if "richTextFetch" in child['class']:
@@ -2919,6 +2950,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = segment(article_processed)
 
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
+        article_processed = article_processed.replace('侯选人', '候选人')  #2024/09/03 修复错别字避免预测错误。
+        article_processed = article_processed.replace('人选人', '入选人')  #2024/09/03 修复错别字避免预测错误。
         # article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号
         article_processed = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", ":", article_processed)
         article_processed = article_processed.replace('.','.').replace('-', '-') # 2021/12/01 修正OCR识别PDF小数点错误问题
@@ -2929,7 +2962,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed)
         article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
         article_processed = article_processed.replace('\(%)', '')    # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网  金额特殊问题
-        article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})):?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题
+        article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20}))?:?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题 例:530377517
         article_processed = re.sub('(不?含(可抵扣增值|\w{,8})税)', '', article_processed)    # 120637247 投标报价(元),(含可抵扣增值税):277,560.00。
         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式:名称)', '供应商名称', article_processed)  # 18889217, 84422177
         article_processed = re.sub(',最高有效报价者:', ',中标人名称:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
@@ -2937,6 +2970,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
         article_processed = re.sub('例:建设银行(甲方全称)', ' ', article_processed)  # 2024/06/12 特殊表达修改 修改 481513912 金采网 附件模板导致错误提取招标人
         article_processed = re.sub('^[,,.。;;、]+', '', article_processed)
+        article_processed = re.sub('资,金', '资金', article_processed)
+        article_processed = re.sub('金,额', '金额', article_processed)
+        article_processed = re.sub('存,款', '存款', article_processed)
         if web_source_no.startswith('DX002756-'):
             article_processed = re.sub('状态:(进行中|已结束)单位', ',项目单位', article_processed)  # 376225646
         if web_source_no.startswith('DX006116-') and re.search('结果公告如下:.{5,50},单位名称:', article_processed):  # 2023/11/20 特殊处理 381591924 381592533 这种提取不到情况
@@ -2961,6 +2997,11 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             article_processed = article_processed[:idx]
         for it in re.finditer('[一二三四五六七八九十\d]、中标候选人名称,', article_processed): # 修复大纲类标点导致提取不到,例:515521734
             article_processed = re.sub(it.group(0), it.group(0)[:-1]+':', article_processed)
+        ser = re.search('项目[编代][码号]/项目名称:(?P<code>(\[审批\])?[\d\-]{10,30})/?(?P<name>[\u4e00-\u9fa5()]{4,35}[,。])', article_processed) # 优化项目编号名称一起写的情况 spxm-53340116.html
+        if ser:
+            article_processed = article_processed.replace(ser.group(0), '项目代码:%s,项目名称:%s' % (
+            ser.group('code'), ser.group('name')))
+        article_processed = re.sub('四舍五入至', '', article_processed) # 修复 533537050 ,中标价(四舍五入至万元):6468万元
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -3053,6 +3094,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
         cost_time[key_preprocess] += time.time()-start_time
 
         #nlp处理
+        outline_list = [] # 20240906 修复下面条件不成立时,后面 list_outlines.append(outline_list) 名称未定义报错
         if article_processed is not None and len(article_processed)!=0:
             split_patten = "。"
             sentences = []
@@ -3180,7 +3222,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     # 使用正则识别金额
     entity_type = "money"
     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d:|\d=\d[-+×]\d:)?(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
                           "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元,合同金额:378.8万元 提取
@@ -3230,7 +3272,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
                         if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
                             entity_text = entity_text[:-3]
                     if k.split("_")[0] == "unit":
-                        if v == '万元' or unit == "":  # 处理  预算金额(元):160万元 这种出现前后单位不一致情况
+                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元):160万元  总价(万元):最终报价:695000.00(元)
                             unit = v
                     if k.split("_")[0] == "text":
                         # print('text_before: ', _match.group(k))
@@ -3286,6 +3328,8 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
                     unit = '万元'
                 elif re.search('^,?(价格币种:\w{2,3},)?价格单位:万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格:39501.094425,价格币种:人民币,价格单位:万元,
                     unit = '万元'
+                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型:(万元)报价:13311.1582,得分:84.46,
+                    unit = '万元'
                 elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
                     if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                         unit = '万元'
@@ -3320,16 +3364,16 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
             entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
             # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
-            if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额',
-                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
+            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
+                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额  20241031工程造价作总投资
                 # print('总投资金额: ', _match.group(0))
                 notes = '总投资'
             elif re.search('投资|概算|建安费|其他费用|基本预备费',
                            sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
                 notes = '投资'
-            elif re.search('工程造价',
-                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
-                notes = '工程造价'
+            # elif re.search('工程造价',
+            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
+            #     notes = '工程造价'
             elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
                   or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
                                sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
@@ -3507,11 +3551,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     re.search('\d[楼层号]', entity_text)==None: # 2024/06/07 修改错误地址实体为角色
                     entity_type = 'org'
 
-                if entity_text.startswith('石山县'): # 2024/04/24 修复实体识别积石山县 识别少字问题
-                    entity_text = '积' + entity_text
-                    if 0<=begin_index_temp-1<len(sentence_text) and sentence_text[begin_index_temp-1] == '积':
-                        begin_index_temp -= 1
-                        ner_entity = (begin_index_temp, end_index_temp, entity_type, entity_text)
+                if begin_index_temp>0 and '县' in entity_text and re.match('前郭尔罗斯蒙古族自治县|积石山县', sentence_text[begin_index_temp-1:end_index_temp]): #20240905 修复实体识别少字问题
+                    entity_text = sentence_text[begin_index_temp-1] + entity_text
+                    begin_index_temp -= 1
+                    ner_entity = (begin_index_temp, end_index_temp, entity_type, entity_text)
                 elif entity_text == '中华人民共和国' and re.search('^\w{2,4}海关', sentence_text[end_index_temp: end_index_temp+6]):  # 2024/04/24 修复 采购单位:中华人民共和国汕尾海关, 识别不到海关
                     ser = re.search('^\w{2,4}海关', sentence_text[end_index_temp: end_index_temp+6])
                     entity_text += ser.group(0)
@@ -4037,11 +4080,17 @@ if __name__=="__main__":
     # content = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
     # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
     # getPredictTable()
-    with open('D:/138786703.html', 'r', encoding='utf-8') as f:
-        sourceContent = f.read()
-        # article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
-        # print(article_processed)
 
-        list_articles, list_sentences, list_entitys, _cost_time = get_preprocessed([['doc_id', sourceContent, "", "", '', '2021-02-01']], useselffool=True)
-        for entity in list_entitys[0]:
-            print(entity.entity_type, entity.entity_text)
+    text = '是否拟中标人:是,评标排名:1,价格类型:(万元)报价:13311.1582,得分:84.46,项目负责人:邓焱文'
+    text = ',采购包1:采购包预算金额(元:1,500000.00,采购包最高限价(元:1,430600.00,'
+    text = '成交人:中坤电力有限公司,成交价格:11493,603.52元,质量:合格,项目工期:117天,'
+    # text = '数量及单位1:65台,单价2:800,投标报价3=1×2:52000。'
+    print(get_money_entity(text, found_yeji=0))
+    # with open('D:/138786703.html', 'r', encoding='utf-8') as f:
+    #     sourceContent = f.read()
+    #     # article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
+    #     # print(article_processed)
+    #
+    #     list_articles, list_sentences, list_entitys, _cost_time = get_preprocessed([['doc_id', sourceContent, "", "", '', '2021-02-01']], useselffool=True)
+    #     for entity in list_entitys[0]:
+    #         print(entity.entity_type, entity.entity_text)

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 109 - 30
BiddingKG/dl/interface/extract.py


+ 162 - 110
BiddingKG/dl/interface/getAttributes.py

@@ -464,6 +464,12 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
     win_tenderer_set = set() # 记录所有预测为中标的实体集合
   # print(PackageList)
     #拿到各个实体的packageName,packageCode
+    main_contain_winner = False # 2024/10/11 判断正文是否包含中标人
+    for entity in list_entity:
+        if entity.entity_type in ['org','company'] and entity.label==2 and entity.values[entity.label]>0.7 and entity.in_attachment==False:
+            main_contain_winner = True
+            break
+
     for entity in list_entity:
         if entity.entity_type in ['org','company']:
             #限制附件里角色values[label]最大概率prob
@@ -477,6 +483,8 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
             values = entity.values
             role_prob = float(values[int(entity.label)])
             if role_prob>=on_value and str(entity.label)!="5":
+                if main_contain_winner and entity.in_attachment and entity.label in [2,3,4]: # 2024/10/11 正文包含中标人,不再提取附件中标人 避免 例:504046747 附件角色OCR错字变两个标段
+                    continue
                 if str(entity.label) in ["0","1"]:
                     packageName = "Project"
                 else:
@@ -583,7 +591,7 @@ def getPackageScopePattern():
     for item in df["list_word"]:
         item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
         pattern += item+"|"
-    pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
+    pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}|##attachment##"
     return pattern
         
 pattern_packageScope = getPackageScopePattern()   
@@ -824,11 +832,13 @@ def getPackagesFromArticle(list_sentence, list_entity):
                         scope_begin = [PackageList_scope[j]["sentence_index"],
                                        PackageList_scope[j]["offsetWords_begin"]]
                     else:
-                        if j == 0:
-                            scope_begin = [0, 0]
-                        else:
-                            scope_begin = [PackageList_scope[j - 1]["sentence_index"],
-                                           PackageList_scope[j - 1]["offsetWords_begin"]]
+                        scope_begin = [PackageList_scope[j]["sentence_index"], 0] # 2024/10/10 改为包作用域开始位置为包号所在句子开头
+                        # if j == 0:
+                        #     scope_begin = [0, 0]
+                        # else:
+                        #     scope_begin = [PackageList_scope[j - 1]["sentence_index"],
+                        #                    PackageList_scope[j - 1]["offsetWords_begin"]]
+
                     if j == len(PackageList_scope) - 1:
                         scope_end = [list_sentence[-1].sentence_index,
                                      changeIndexFromWordToWords(list_sentence[-1].tokens,
@@ -943,7 +953,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 packDict[packageName]["roleList"][i].ratio = ratio.ratio_value
     def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
         for i in range(len(packDict[packageName]["roleList"])):
-            if packDict[packageName]["roleList"][i].entity_text==entity:
+            if packDict[packageName]["roleList"][i].entity_text==entity and not packDict[packageName]["roleList"][i].serviceTime:
                 # packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text
                 packDict[packageName]["roleList"][i].serviceTime = extract_serviceTime(serviceTime.entity_text,"")
 
@@ -1591,7 +1601,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
                             continue
                         # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
-                        if _subject.label in [2,3,4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
+                        if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
+                            continue
+                        # 角色为招标/代理人,排除"纪检|监察"相关的联系人
+                        if _subject.label in [0,1] and re.search("纪检|监察",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
                             continue
                         if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
                             if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系",
@@ -2024,7 +2037,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if entity.label in [2, 3, 4] and distance>=20:
                                     break
                                 # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
-                                if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                                if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                                    break
+                                # 角色为招标/代理人,排除"纪检|监察"相关的联系人
+                                if entity.label in [0,1] and re.search("纪检|监察",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                                     break
                                 if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
                                     if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",
@@ -2109,7 +2125,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                             new_split_list[split_index][1]:
                                         mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
                                         if re.search(key_phone, mid_sentence):
-                                            if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
+                                            if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
                                                 pass
                                             else:
                                                 distance = 1
@@ -2162,7 +2178,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                         p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
                                         if next_entity.entity_type == 'person' and _phone in p_phone:
                                             pass
-                                        elif entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
+                                        elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
                                             pass
                                         else:
                                             distance = (tokens_num_dict[
@@ -2913,6 +2929,29 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             if get_tenderee_contacts:
                                 break
 
+    # 如果同一个电话连到了不同的单位就直接去掉(2024-09-03 新增)
+    get_phone_dict = dict()
+    for k in PackDict.keys():
+        for i in range(len(PackDict[k]["roleList"])):
+            for item in PackDict[k]["roleList"][i].linklist:
+                if item[1]:
+                    if item[1] not in get_phone_dict:
+                        get_phone_dict[item[1]] = set()
+                    get_phone_dict[item[1]].add(PackDict[k]["roleList"][i].entity_text)
+    # print(get_phone_dict)
+    remove_phone = []
+    for phone,role_list in get_phone_dict.items():
+        if len(role_list)>1:
+            remove_phone.append(phone)
+    for k in PackDict.keys():
+        for i in range(len(PackDict[k]["roleList"])):
+            remove_list = []
+            for item in PackDict[k]["roleList"][i].linklist:
+                if item[1] and item[1] in remove_phone:
+                    remove_list.append(item)
+            for _item in remove_list:
+                PackDict[k]["roleList"][i].linklist.remove(_item)
+
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
@@ -4223,9 +4262,9 @@ def limit_maximum_amount(dic, list_entity):
                     if l["role_money"]['money_unit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(l["role_money"]['money'])):
                         # print('单位元小金额且格式类似万元的乘以万倍')
                         l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) * 10000)
-                    else:
-                        # print('中标金额小于限额:%d元 去除' % minximum_amount)
-                        l["role_money"]['money'] = 0
+                    # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例:520248605
+                    #     # print('中标金额小于限额:%d元 去除' % minximum_amount)
+                    #     l["role_money"]['money'] = 0
 
             if float(value['tendereeMoney']) > maximum_amount:
                 flag = 1
@@ -4246,9 +4285,9 @@ def limit_maximum_amount(dic, list_entity):
                 if value['tendereeMoneyUnit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(value['tendereeMoney'])):
                     # print('单位元小金额且格式类似万元的乘以万倍')
                     value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) * 10000)
-                else:
-                    # print('招标金额小于限额:%d元 去除' % minximum_amount)
-                    value['tendereeMoney'] = 0
+                # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例:520248605
+                #     # print('招标金额小于限额:%d元 去除' % minximum_amount)
+                #     value['tendereeMoney'] = 0
 
 
 def limit_maximum_amount_backup(prem, industry):
@@ -4296,69 +4335,66 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
     :return:
     '''
     try:
-        if 'win_tenderer' in str(prem) and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]|与[^,。]{6,100}联合体', list_articles[0].content):
+        if 'win_tenderer' in str(prem[0]['prem']) and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]|与[^,。]{6,100}联合体', list_articles[0].content):
             sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
-            for project in prem[0].values():
-                if not isinstance(project, dict):
-                    continue
-                for v in project.values():
-                    for d in v['roleList']:
-                        if d.get('role_name', '') == 'win_tenderer':
-                            winner = d.get('role_text')
-                            join_l = [winner]
-                            for list_entity in list_entitys:
-                                for i in range(len(list_entity)-1):
-                                    _entity = list_entity[i]
-                                    b = _entity.wordOffset_begin
-                                    e = _entity.wordOffset_end
-                                    if _entity.entity_type in ['org', 'company'] and _entity.label==2\
-                                            and _entity.entity_text==winner:
-                                        s = sentences[_entity.sentence_index].sentence_text
-                                        find_joint = 0 # 是否包含联合体
-                                        for j in range(i+1, len(list_entity)):
-                                            behind_entity = list_entity[j]
-                                            b2 = behind_entity.wordOffset_begin
-                                            e2 = behind_entity.wordOffset_end
-                                            if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
-                                                    and b2-e<13 and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[e:b2]) or \
-                                                re.search('(联合(体|方|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
-                                                join_l.append(behind_entity.entity_text)
-                                                b = b2
-                                                e = e2
-                                                find_joint = 1
-                                            elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
-                                                join_l.append(behind_entity.entity_text)
-                                                b = b2
-                                                e = e2
-                                            elif e == e2: # 修复重复实体导致中断情况
-                                                continue
-                                            else:
-                                                break
-                                        if len(join_l)>1:
-                                            d['win_tenderer_joint'] = ','.join(set(join_l))
-
-
-
-                                            # behind_entity = list_entity[i + 1]
-                                    # if _entity.sentence_index== behind_entity.sentence_index and _entity.entity_type in ['org', 'company'] and _entity.label==2\
-                                    #         and _entity.entity_text==winner and behind_entity.entity_type in ['org', 'company'] and behind_entity.label==5:
-                                    #     s = sentences[_entity.sentence_index].sentence_text
-                                    #     b = _entity.wordOffset_begin
-                                    #     e = _entity.wordOffset_end
-                                    #     b2 = behind_entity.wordOffset_begin
-                                    #     e2 = behind_entity.wordOffset_end
-                                        # if re.search('(联合体)', s[e2:e2+6]) and b2-e<3:
-                                        #     print('联合体:', s[max(0, b-10):e2+10])
-                                        #     d['win_tenderer_joint'] = '%s,%s'%(_entity.entity_text, behind_entity.entity_text)
-                                        #     break
-                                        # elif re.search('(联合体((牵头|主办)(人|方|单位)|主体)|牵头(人|方|单位))|(联合体)?成员:|特殊普通合伙:', s[e:b2]) and b2-e<10:
-                                        #     d['win_tenderer_joint'] = '%s,%s' % (_entity.entity_text, behind_entity.entity_text)
-                                        #     print('联合体:', s[max(0, b - 10):e2 + 10])
-                                        #     break
+            for v in prem[0]['prem'].values():
+                for d in v['roleList']:
+                    if d.get('role_name', '') == 'win_tenderer':
+                        winner = d.get('role_text')
+                        join_l = [winner]
+                        for list_entity in list_entitys:
+                            for i in range(len(list_entity)-1):
+                                _entity = list_entity[i]
+                                b = _entity.wordOffset_begin
+                                e = _entity.wordOffset_end
+                                if _entity.entity_type in ['org', 'company'] and _entity.label==2\
+                                        and _entity.entity_text==winner:
+                                    s = sentences[_entity.sentence_index].sentence_text
+                                    find_joint = 0 # 是否包含联合体
+                                    for j in range(i+1, len(list_entity)):
+                                        behind_entity = list_entity[j]
+                                        b2 = behind_entity.wordOffset_begin
+                                        e2 = behind_entity.wordOffset_end
+                                        if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
+                                                and b2-e<13 and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[e:b2]) or \
+                                            re.search('(联合(体|方|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
+                                            join_l.append(behind_entity.entity_text)
+                                            b = b2
+                                            e = e2
+                                            find_joint = 1
+                                        elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
+                                            join_l.append(behind_entity.entity_text)
+                                            b = b2
+                                            e = e2
+                                        elif e == e2: # 修复重复实体导致中断情况
+                                            continue
+                                        else:
+                                            break
+                                    if len(join_l)>1:
+                                        d['win_tenderer_joint'] = ','.join(set(join_l))
+
+
+
+                                        # behind_entity = list_entity[i + 1]
+                                # if _entity.sentence_index== behind_entity.sentence_index and _entity.entity_type in ['org', 'company'] and _entity.label==2\
+                                #         and _entity.entity_text==winner and behind_entity.entity_type in ['org', 'company'] and behind_entity.label==5:
+                                #     s = sentences[_entity.sentence_index].sentence_text
+                                #     b = _entity.wordOffset_begin
+                                #     e = _entity.wordOffset_end
+                                #     b2 = behind_entity.wordOffset_begin
+                                #     e2 = behind_entity.wordOffset_end
+                                    # if re.search('(联合体)', s[e2:e2+6]) and b2-e<3:
+                                    #     print('联合体:', s[max(0, b-10):e2+10])
+                                    #     d['win_tenderer_joint'] = '%s,%s'%(_entity.entity_text, behind_entity.entity_text)
+                                    #     break
+                                    # elif re.search('(联合体((牵头|主办)(人|方|单位)|主体)|牵头(人|方|单位))|(联合体)?成员:|特殊普通合伙:', s[e:b2]) and b2-e<10:
+                                    #     d['win_tenderer_joint'] = '%s,%s' % (_entity.entity_text, behind_entity.entity_text)
+                                    #     print('联合体:', s[max(0, b - 10):e2 + 10])
+                                    #     break
     except Exception as e:
         print('获取联合体抛出异常', e)
 
-def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
+def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, all_winner=False):
     '''
     获取多中标人及正文、附件所有金额,多中标人multi_winner写入prem,返回金额列表
     :param channel_dic:
@@ -4369,7 +4405,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
     '''
 
     def add_multi_winner(pack_l, winner_l):
-        if len(prem[0]) > 1 and len(set([it[0] for it in pack_l])) > 1:  # 多标段多中标人处理
+        if len(prem[0]['prem']) > 1 and len(set([it[0] for it in pack_l])) > 1:  # 多标段多中标人处理
             pk_dic = {}
             for ent in winner_l:
                 for i in range(len(pack_l)):
@@ -4395,40 +4431,33 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                 multi_winner = multi_winner - tenderee_or_agency
                 if len(multi_winner) < 2:
                     continue
-                for project in prem[0].values():
-                    if not isinstance(project, dict):
-                        continue
-                    for k, v in project.items():
-                        if pk == k:
-                            for d in v['roleList']:
-                                if d.get('role_name', '') == 'win_tenderer':
-                                    if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
-                                        d['multi_winner'] = ','.join(set(multi_winner))
-        else:
-            multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
-            if len(multi_winner) > 1:
-                for project in prem[0].values():
-                    if not isinstance(project, dict):
-                        continue
-                    for v in project.values():
+                for k, v in prem[0]['prem'].items():
+                    if pk == k:
                         for d in v['roleList']:
                             if d.get('role_name', '') == 'win_tenderer':
                                 if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
                                     d['multi_winner'] = ','.join(set(multi_winner))
-                                break
+        elif 0 < len(prem[0]['prem']) < 3: # 修复 单包多中标人 例:285780273
+            multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
+            if len(multi_winner) > 1:
+                for v in prem[0]['prem'].values():
+                    for d in v['roleList']:
+                        if d.get('role_name', '') == 'win_tenderer':
+                            if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
+                                d['multi_winner'] = ','.join(set(multi_winner))
+                            break
 
     moneys = []
     moneys_attachment = []
-    if channel_dic['docchannel']['docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
+    if channel_dic['docchannel']['life_docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
         sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
-        entitys = sorted(list_entitys[0], key=lambda x: x.sentence_index)
         finalists = [] # 入围供应商
         multi_winner_l = [] # 保存中标人名称列表
         tenderee_or_agency = set()
         package_l = []
         i = 0
-        while i < len(entitys)-1:
-            ent = entitys[i]
+        while i < len(list_entitys[0])-1:
+            ent = list_entitys[0][i]
             b_idx_fr = ent.wordOffset_begin
             e_idx_fr = ent.wordOffset_end
             i += 1
@@ -4440,19 +4469,18 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                     moneys.append(money)
             elif ent.entity_type in ['package']:
                 package_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
-            elif ent.entity_type in ['org', 'company'] and ent.label in [0,1] and ent.values[ent.label] > 0.8:
-                tenderee_or_agency.add(ent.entity_text)
-            elif ent.entity_type in ['org', 'company'] and ent.label == 2:
+            elif ent.entity_type in ['org', 'company']:
                 sentence_text = sentences[ent.sentence_index].sentence_text
                 pre_text = sentence_text[max(0, b_idx_fr - 10):b_idx_fr]
-                if ent.values[ent.label] > 0.8:
+                if ent.label in [0,1] and ent.values[ent.label] > 0.8:
+                    tenderee_or_agency.add(ent.entity_text)
+                elif ent.label == 2 and (ent.values[ent.label] > 0.8 or all_winner):
                     multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
                     for j in range(i, len(list_entitys[0])):
                         ent_bh = list_entitys[0][j]
                         b_idx_bh = ent_bh.wordOffset_begin
                         e_idx_bh = ent_bh.wordOffset_end
-                        if ent_bh.entity_type in ['org','company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
-                            sentence_text = sentences[ent_bh.sentence_index].sentence_text
+                        if ent_bh.entity_type in ['org','company'] and ent_bh.label in [2,5] and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
                             if sentence_text[e_idx_fr:b_idx_bh] in [';', '、', '&', ',', '/', '//'] and (
                                     len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in [';', '、', '&', ',','/', '//','。']):  # 修复多中标人刚好在文末index超出报错,例子 407126558
                                 multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
@@ -4460,7 +4488,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                                 i = j + 1
                             else:
                                 break
-                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr:
+                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr: # 两实体间没符号分割情况
                             multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
                             e_idx_fr = e_idx_bh
                             i = j + 1
@@ -4470,6 +4498,8 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                             break
                     if re.search('入围', pre_text) and re.search('未入围', pre_text)==None:
                         finalists.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+                elif all_winner==1 and ent.label in [3,4,5] and re.search('第[一二三四五六七八九十0-9]+名|候选(人|单位)|入围(单位|供应商)|投标银行', pre_text) and re.search('未', pre_text)==None:
+                    multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
 
         if len(multi_winner_l)>=2:
             winner_main = [it for it in multi_winner_l if not it[3]]
@@ -4532,6 +4562,10 @@ def update_prem(old_prem, new_prem, in_attachment=False):
             k = list(old_prem.keys()-set(['Project']))[0]
             k_new = list(new_prem.keys())[0]
             new_prem[k] = new_prem.pop(k_new)
+        elif len(old_prem) == 1 and len(new_prem) == 1 and 'Project' not in old_prem and set(new_prem)&set(old_prem)==set(): # 如果表格提取包与非表格提取都是一个包且不同,把表格提取包名替换为非表格包名
+            k = list(old_prem.keys()-set(['Project']))[0]
+            k_new = list(new_prem.keys())[0]
+            new_prem[k] = new_prem.pop(k_new)
 
         if len(new_prem) == len(old_prem) == 1 and 'Project' not in new_prem and 'Project' in old_prem: # 如果表格提取到包号,非表格没提取到,合并到Project
             k = list(new_prem.keys())[0]
@@ -4552,6 +4586,8 @@ def update_prem(old_prem, new_prem, in_attachment=False):
                                 tmp_l.append(d2)
                                 if d2['role_text'] != "":
                                     d['role_text'] = d2['role_text']
+                                if d2['serviceTime'] != "":
+                                    d['serviceTime'] = d2['serviceTime']
                                 if float(d2['role_money']['money']) != 0:  # 如果表格提取的金额不为0才替换
                                     d['role_money']['money'] = d2['role_money']['money']
                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
@@ -4585,12 +4621,14 @@ def update_prem(old_prem, new_prem, in_attachment=False):
                                 tmp_l.append(d2)
                                 if d2['role_text'] != "":
                                     d['role_text'] = d2['role_text']
+                                if d2['serviceTime'] != "":
+                                    d['serviceTime'] = d2['serviceTime']
                                 if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换
                                     d['role_money']['money'] = d2['role_money']['money']
                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
-                                for k in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
-                                    if d2[k]:
-                                        d[k] = d2[k]
+                                for k2 in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
+                                    if d2[k2]:
+                                        d[k2] = d2[k2]
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)
@@ -4601,7 +4639,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
     # return old_prem
 
-def  confirm_prem(prem, channel_dic):
+def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
     '''
     规则检查纠正prem,如果Project包中标人在其他包中标人,去掉project包中标角色;如果有其他包中标人,去掉roleList为空的包;
     :param prem: prem 字段字典
@@ -4610,6 +4648,8 @@ def  confirm_prem(prem, channel_dic):
     if len(prem) > 1:  # 表格提取到中标人的,去掉project包中标人
         pro_winner = set()
         other_winner = set()
+        other_winner_prob = 0
+        pro_winner_prob = 0
         empty_roleList = []
         for k in prem:
             prem[k]['uuid'] = str(uuid.uuid4()) # 20240627 每个包都添加uuid
@@ -4623,21 +4663,33 @@ def  confirm_prem(prem, channel_dic):
                             pro_winner.update(set(d['win_tenderer_joint'].split(',')))
                         if 'multi_winner' in d:
                             pro_winner.update(set(d['multi_winner'].split(',')))
+                        if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6:
+                            pro_winner_prob = d.get('role_prob', 0)
                     else:
                         other_winner.add(d['role_text'])
                         if 'win_tenderer_joint' in d:
                             other_winner.update(set(d['win_tenderer_joint'].split(',')))
                         if 'multi_winner' in d:
                             other_winner.update(set(d['multi_winner'].split(',')))
-        if pro_winner & other_winner != set():
+                        if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6:
+                            other_winner_prob = d.get('role_prob', 0)
+        if pro_winner!=set() and (pro_winner & other_winner != set() or other_winner_prob>pro_winner_prob): # 如果默认包与其他包中标人重复或其他包中标人概率比默认包大,删除默认包中标人
             prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if
                                                d['role_name'] not in ['win_tenderer', 'second_tenderer',
                                                                       'third_tenderer']]
+        elif other_winner_prob<pro_winner_prob and len(prem)==2: # 两个包情况,如果默认包中标人概率比其他包大,删除其他包
+            rm_k = [k for k in prem if k != 'Project']
+            for k in rm_k:
+                prem.pop(k)
         if other_winner and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
             for k in empty_roleList:
                 prem.pop(k)
     elif "Project" in prem:
         prem['Project']['uuid'] = str(uuid.uuid4())
+    if is_deposit_project and float(total_tendereeMoney)!=0 and len(prem)==1: #20241107 存款类项目有总投资没招标金额且只有一个标段,把总投资作招标金额
+        for k in prem:
+            if float(prem[k]['tendereeMoney'])==0:
+                prem[k]['tendereeMoney'] = total_tendereeMoney
 
 
 def fix_single_source(prem, channel_dic, original_docchannel):

BIN
BiddingKG/dl/interface/header_set.pkl


+ 11 - 10
BiddingKG/dl/interface/htmlparser.py

@@ -534,16 +534,17 @@ class ParseDocument():
                                 has_product = True
                                 break
 
-            if _type=="sentence":
-                if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
-                    list_data[-1]["text"] += _text
-                    list_data[-1]["line_width"] = len(_text)
-                    _append = True
-                elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
-                    if list_data[-1]["line_width"]>=max_length*0.7:
-                        list_data[-1]["text"] += _text
-                        list_data[-1]["line_width"] = len(_text)
-                        _append = True
+            # 合并两个非标题句子 20241106 注销,由于 485441521 招标内容结束位置不对
+            # if _type=="sentence":
+            #     if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
+            #         list_data[-1]["text"] += _text
+            #         list_data[-1]["line_width"] = len(_text)
+            #         _append = True
+            #     elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
+            #         if list_data[-1]["line_width"]>=max_length*0.7:
+            #             list_data[-1]["text"] += _text
+            #             list_data[-1]["line_width"] = len(_text)
+            #             _append = True
 
             if _type=="table":
                 _soup = BeautifulSoup(_text,"lxml")

+ 3 - 0
BiddingKG/dl/interface/modelFactory.py

@@ -104,6 +104,9 @@ class Model_role_classify_word():
             text = re.sub('(最终)?排名:', '    ', text)
         text = re.sub('交易单位', '发布单位', text)
         text = re.sub('[,:]各种数据:', ':', text) # 20240620优化 478331984 山东省交通运输厅站源提取不到 各种数据:中标单位,各种数据:济南金曰公路工程有限公司,
+        text = re.sub('电子签章', '', text) # 20240924 修复 529923459 电子签名:投标人名称(电子签章:西君兰信息科技有限公司,2024年9月7日 预测为中标
+        text = re.sub('采购方式', 'xxxx', text) # 修复 499096797 招标人预测错误
+        text = re.sub('中标人\d名称', '中标人名称', text) # 修复 499096797 中标人预测错误
         return text.replace('(', '(').replace(')', ')').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
 
     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):

+ 24 - 13
BiddingKG/dl/interface/outline_extractor.py

@@ -52,7 +52,6 @@ def extract_sentence_list(sentence_list):
                 new_sentence2_list_attach.append(sentence2)
             else:
                 new_sentence2_list.append(sentence2)
-
     return new_sentence2_list, new_sentence2_list_attach
 
 requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
@@ -63,18 +62,18 @@ addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|
 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
 out_lines = []
 
-def extract_parameters(parse_document, content):
+def extract_parameters(parse_document):
     '''
     通过大纲、预处理后文本正则获取需要字段
     :param parse_document: ParseDocument() 方法返回结果
-    :param content: 公告预处理后文本
     :return:
     '''
     list_data = parse_document.tree
-    requirement_text = ''
-    aptitude_text = ''
-    addr_bidopen_text = ''
-    addr_bidsend_text = ''
+    requirement_text = '' # 采购内容
+    aptitude_text = '' # 资质要求
+    addr_bidopen_text = '' # 开标地址
+    addr_bidsend_text = '' # 投标地址
+    requirement_scope = [] # 采购内容始末位置
 
     _find_count = 0
     _data_i = -1
@@ -86,14 +85,18 @@ def extract_parameters(parse_document, content):
         # print(_data.keys())
         if _type=="sentence":
             if _data["sentence_title"] is not None:
-                if re.search('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', _text[:10]):
+                if re.search('[((][一二三四五六七八九十}]+[))]|[一二三四五六七八九十]+\s*、|^\d{1,2}[.、][\u4e00-\u9fa5]', _text[:10]):
                     out_lines.append((_text, _data['sentence_index'], _data['wordOffset_begin']))
 
                 if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求,', _text[:30])==None:
+                    b = (_data['sentence_index'], _data['wordOffset_begin'])
                     childs = get_childs([_data])
                     for c in childs:
                         # requirement_text += c["text"]+"\n"
                         requirement_text += c["text"]
+                    e = (c['sentence_index'], c["wordOffset_end"]) if len(childs)>0 else (_data['sentence_index'], _data['wordOffset_end'])
+                    requirement_scope.append(b)
+                    requirement_scope.append(e)
                     _data_i += len(childs)
                     _data_i -= 1
     _data_i = -1
@@ -161,15 +164,23 @@ def extract_parameters(parse_document, content):
         addr_bidopen_text = addr_bidopen_text[b:e]
     elif re.search('开启', addr_bidopen_text) and re.search('时间:\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
         addr_bidopen_text = ""
-    if addr_bidopen_text == "":
-        ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content)
-        if ser:
-            addr_bidopen_text = ser.group(0)
     if re.search('时间:', addr_bidsend_text) and re.search('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
         for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
             b, e = ser.span()
         addr_bidsend_text = addr_bidsend_text[b:e]
-    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines
+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope
+
+def extract_addr(content):
+    '''
+    Extract an address clause from the announcement text with a single regular expression.
+
+    :param content: preprocessed announcement text
+    :return: the full text of the first match (``group(0)``), or '' when nothing matches
+    '''
+    addr_bidopen_text = ''  # default result when the pattern does not match
+    # Pattern layout: an optional procedure keyword group, an optional "会议", then "地点"/"地址",
+    # an optional "(网址)" marker, a ':' or '为' separator, 2-100 characters that are not ',', ';'
+    # or '。', and finally one of those three terminators (the terminator is part of the match).
+    # NOTE(review): as rendered here the literal holds one more ')' than '(' (see "文件))?");
+    # confirm the exact pattern against the repository before relying on it.
+    ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content)
+    if ser:
+        addr_bidopen_text = ser.group(0)
+    return addr_bidopen_text
 
 if __name__ == "__main__":
     # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 383 - 210
BiddingKG/dl/interface/predictor.py


BIN
BiddingKG/dl/table_head/model_40_0.951.pth


BIN
BiddingKG/dl/table_head/model_40_0.959.pth


+ 83 - 0
BiddingKG/dl/table_head/models/model_torch.py

@@ -0,0 +1,83 @@
+import torch.nn as nn
+import torch
+
+
+class TableHeadModel(nn.Module):
+    """Per-cell scorer for a table: maps a 5-D tensor of character embeddings to one sigmoid score per cell.
+
+    ``forward`` expects ``x`` of shape (batch, row, col, char_num, char_embed) and returns a
+    (row, col) tensor of values in (0, 1). The reshapes in ``forward`` only work when
+    batch == 1, char_num == ``self.char_num`` (20) and char_embed == ``self.char_embed`` (60).
+    NOTE(review): presumably the score is the probability that a cell is a table header -- confirm
+    against the training code.
+    """
+
+    def __init__(self):
+        """Create the layers; sizes are fixed by the three constants below."""
+        super(TableHeadModel, self).__init__()
+        self.char_num = 20  # characters per cell expected by dense3
+        self.char_embed = 60  # input embedding width expected by dense0
+        self.char_embed_expand = 128  # channel width used by all conv layers
+
+        # Expands each character embedding from char_embed to char_embed_expand.
+        self.dense0 = nn.Linear(self.char_embed, self.char_embed_expand)
+
+        # Head: flattened per-cell features -> 64 -> 1.
+        self.dense3 = nn.Linear(self.char_num * self.char_embed_expand, 64)
+        self.dense4 = nn.Linear(64, 1)
+
+        self.sigmoid = nn.Sigmoid()
+
+        self.ln_dnn_2 = nn.LayerNorm([64])
+
+        # Stored only; nothing in this class reads self.device.
+        self.device = torch.device("cpu")
+
+        # One activation and one dropout module are shared by every stage in forward().
+        self.relu = nn.LeakyReLU()
+        self.dropout = nn.Dropout(0.3)
+
+        # Kernel-3 convolutions with padding 1, so the convolved dimensions keep their size.
+        self.cnn1d_0 = nn.Conv1d(self.char_embed_expand,
+                                 self.char_embed_expand,
+                                 (3,), padding=self.get_padding(3))
+        self.cnn1d_1 = nn.Conv1d(self.char_embed_expand,
+                                 self.char_embed_expand,
+                                 (3,), padding=self.get_padding(3))
+
+        self.cnn3d_0 = nn.Conv3d(self.char_embed_expand, self.char_embed_expand,
+                                 (3, 3, 3), padding=self.get_padding(3))
+        self.cnn3d_1 = nn.Conv3d(self.char_embed_expand, self.char_embed_expand,
+                                 (3, 3, 3), padding=self.get_padding(3))
+
+    def get_padding(self, kernel_size, stride=1):
+        """Return ``(kernel_size - 1) // 2 * stride`` (1 for the kernel size 3 used here)."""
+        return (kernel_size - 1) // 2 * stride
+
+    def forward(self, x):
+        """Score every cell of one table.
+
+        :param x: tensor of shape (batch, row, col, char_num, char_embed); see the class
+                  docstring for the size constraints imposed by the reshapes below
+        :return: tensor of shape (row, col) holding sigmoid outputs
+        """
+        # `batch` is unpacked but not used afterwards.
+        batch, row, col, char_num, char_embed = x.shape
+
+        # cnn 1d
+        # Drop dim 0 (only removed when it has size 1) and treat every cell as one sequence:
+        # (row*col, char_num, char_embed).
+        cnn1d_x = torch.squeeze(x, 0)
+        cnn1d_x = cnn1d_x.view([row*col, char_num, char_embed])
+
+        # (row*col, char_num, char_embed_expand)
+        cnn1d_x = self.dense0(cnn1d_x)
+
+        # Conv1d wants channels first: (row*col, char_embed_expand, char_num).
+        cnn1d_x = torch.permute(cnn1d_x, [0, 2, 1])
+        cnn1d_x = self.cnn1d_0(cnn1d_x)
+        cnn1d_x = self.relu(cnn1d_x)
+        cnn1d_x = self.dropout(cnn1d_x)
+        cnn1d_x = self.cnn1d_1(cnn1d_x)
+        cnn1d_x = self.relu(cnn1d_x)
+        cnn1d_x = self.dropout(cnn1d_x)
+
+        # Back to channels last and restore the table grid: (1, row, col, char_num, char_embed_expand).
+        cnn1d_x = torch.permute(cnn1d_x, [0, 2, 1])
+        cnn1d_x = cnn1d_x.contiguous().view(row, col, char_num, self.char_embed_expand)
+        cnn1d_x = torch.unsqueeze(cnn1d_x, 0)
+        # print(cnn1d_x.shape)
+
+        # cnn 3d
+        # Channels first for Conv3d: (1, char_embed_expand, char_num, row, col); the 3x3x3 kernel
+        # therefore spans neighbouring characters, rows and columns.
+        cnn3d_x = torch.permute(cnn1d_x, [0, 4, 3, 1, 2])
+        cnn3d_x = self.cnn3d_0(cnn3d_x)
+        cnn3d_x = self.relu(cnn3d_x)
+        cnn3d_x = self.dropout(cnn3d_x)
+        cnn3d_x = self.cnn3d_1(cnn3d_x)
+        cnn3d_x = self.relu(cnn3d_x)
+        cnn3d_x = self.dropout(cnn3d_x)
+
+        # (char_embed_expand, char_num, row, col) -> (row, col, char_num, char_embed_expand)
+        # -> flattened per cell to (row, col, char_num * char_embed_expand).
+        cnn3d_x = torch.squeeze(cnn3d_x, 0)
+        cnn3d_x = torch.permute(cnn3d_x, [2, 3, 1, 0])
+        cnn3d_x = cnn3d_x.contiguous().view(row, col, char_num * self.char_embed_expand)
+
+        # dnn
+        # (row, col, 64) -> (row, col, 1) -> sigmoid -> (row, col)
+        x = self.dense3(cnn3d_x)
+        x = self.ln_dnn_2(x)
+        x = self.relu(x)
+        x = self.dense4(x)
+        x = self.sigmoid(x)
+        x = torch.squeeze(x, -1)
+        return x

+ 132 - 0
BiddingKG/dl/table_head/pre_process_torch.py

@@ -0,0 +1,132 @@
+#coding=utf-8
+import os
+import sys
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../../../"))
+from BiddingKG.dl.common.Utils import embedding_word, embedding_word_forward
+
+
+def set_label(row, row_label):
+    """Clear the labels of cells that are not allowed to count as table heads.
+
+    :param row: cell texts of one table row
+    :param row_label: per-cell labels aligned with ``row``
+    :return: new label list; all zeros when the row has a single cell or only
+        one distinct value, otherwise the given labels with placeholder
+        cells forced to 0
+    """
+    # A one-cell row, or a row repeating a single value, carries no head.
+    if len(row) == 1 or len(set(row)) == 1:
+        return [0] * len(row)
+
+    placeholders = ["", " ", "/", '无', '-', '~~']
+    return [0 if cell in placeholders else row_label[idx] for idx, cell in enumerate(row)]
+
+
+def set_same_table_head(inputs, y_pred1):
+    """Give neighbouring cells with identical embeddings the same prediction.
+
+    A first pass walks every row left to right, a second pass walks every
+    column top to bottom. Whenever two adjacent cells differ by less than 1e-4
+    in every element but their scores fall on different sides of 0.5, the
+    score of the earlier cell is copied onto the later one.
+
+    :param inputs: cell embeddings; axis 0 is squeezed, then indexed [row, col, :, :]
+    :param y_pred1: per-cell scores indexed [row, col]; modified in place
+    :return: the same ``y_pred1`` object after propagation
+    """
+    cells = torch.squeeze(inputs, 0)
+    row_cnt, col_cnt = cells.shape[0], cells.shape[1]
+
+    def _sync(src, dst):
+        # Only cells whose embeddings match everywhere are linked.
+        gap = torch.abs(cells[src[0], src[1], :, :] - cells[dst[0], dst[1], :, :])
+        if not (gap < 1e-4).all():
+            return
+        src_score, dst_score = y_pred1[src], y_pred1[dst]
+        both_low = src_score <= 0.5 and dst_score <= 0.5
+        both_high = src_score > 0.5 and dst_score > 0.5
+        if not (both_low or both_high):
+            y_pred1[dst] = y_pred1[src]
+
+    # horizontal pass: compare each cell with its right-hand neighbour
+    for r in range(row_cnt):
+        for c in range(col_cnt - 1):
+            _sync((r, c), (r, c + 1))
+
+    # vertical pass: compare each cell with the one below it
+    for c in range(col_cnt):
+        for r in range(row_cnt - 1):
+            _sync((r, c), (r + 1, c))
+
+    return y_pred1
+
+
+def data_to_numpy29(data_list, data_label_list):
+    """
+    Convert tables of cell text into float32 arrays.
+
+    Output tables are shaped (table_cnt, row, col, 20, 60).
+
+    :param data_list: list of tables, each a list of rows of cell text
+    :param data_label_list: matching per-cell labels; falsy to skip labels
+    :return: (embeddings, labels, masks); a mask entry is 0 for the blank
+        cells "", " " and "/", and 1 for every other cell
+    """
+    all_embeds, all_labels, all_masks = [], [], []
+    for table_idx, table in enumerate(data_list):
+        table_label = data_label_list[table_idx] if data_label_list else []
+
+        row_embeds, row_labels, row_masks = [], [], []
+        for row_idx, cells in enumerate(table):
+            row_masks.append([0 if cell in ["", " ", "/"] else 1 for cell in cells])
+            row_embeds.append(embedding_word_forward(cells, shape=(len(cells), 20, 60)))
+            if data_label_list:
+                int_labels = [int(value) for value in table_label[row_idx]]
+                row_labels.append(set_label(cells, int_labels))
+
+        row_embeds = np.array(row_embeds, dtype=np.float32)
+        row_labels = np.array(row_labels, dtype=np.float32)
+        row_masks = np.array(row_masks, dtype=np.float32)
+        all_embeds.append(row_embeds)
+        all_labels.append(row_labels)
+        all_masks.append(row_masks)
+
+    return (
+        np.array(all_embeds, dtype=np.float32),
+        np.array(all_labels, dtype=np.float32),
+        np.array(all_masks, dtype=np.float32),
+    )
+
+
+class CustomDatasetTiny40(Dataset):
+    """Dataset of raw tables that converts one table to arrays per item.
+
+    ``mode`` selects which part of the data is served:
+
+    * 0 - training part: everything after the first 10% (rounded down)
+    * 1 - test part: the first 10% (rounded down)
+    * any other value - the complete data, without splitting
+    """
+
+    def __init__(self, data_x, data_y, mode=0):
+        """
+        :param data_x: list of tables (rows of cell text)
+        :param data_y: list of label tables aligned with ``data_x``
+        :param mode: 0 for train split, 1 for test split, otherwise all data
+        """
+        if mode in (0, 1):
+            # Split -> Train, Test
+            split_size = int(len(data_x) * 0.1)
+            if mode == 0:
+                self.data = data_x[split_size:]
+                self.targets = data_y[split_size:]
+            else:
+                self.data = data_x[:split_size]
+                self.targets = data_y[:split_size]
+        else:
+            # This branch used to leave data/targets unset, so __len__ and
+            # __getitem__ raised AttributeError; serve everything instead.
+            self.data = data_x
+            self.targets = data_y
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        """Return (embedding, label, mask) arrays for the table at ``idx``."""
+        # data_to_numpy29 works on lists of tables, so wrap the single table
+        # and unwrap the leading axis of each result again.
+        x, y, mask = data_to_numpy29([self.data[idx]], [self.targets[idx]])
+        return x[0], y[0], mask[0]

+ 68 - 0
BiddingKG/dl/table_head/predict_torch.py

@@ -0,0 +1,68 @@
+import copy
+import os
+import sys
+import torch
+from torch.utils.data import DataLoader
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../../../"))
+from BiddingKG.dl.table_head.models.model_torch import TableHeadModel
+from BiddingKG.dl.table_head.pre_process_torch import CustomDatasetTiny40, set_same_table_head, set_label
+
+# Inference is pinned to the CPU.
+device = torch.device("cpu")
+# Checkpoint file, expected to sit in the same directory as this module.
+model_path = os.path.abspath(os.path.dirname(__file__)) + '/model_40_0.951.pth'
+# Batch size handed to the DataLoader in predict().
+batch_size = 1
+
+
+def predict(table_text_list):
+    """Predict a head / non-head label for every cell of a table.
+
+    The model is built and its weights loaded on the first call, then cached
+    in this module's globals under the name ``model``.
+
+    :param table_text_list: table as a list of rows, each a list of cell text
+    :return: list of per-row label lists; ``[]`` for an empty table, and all
+        zeros (no model call) when the first row has 50 or more columns
+    """
+    model = globals().get("model")
+    if model is None:
+        print("="*15, "init table_head model", "="*15)
+        # Instantiate the model
+        model = TableHeadModel()
+        model.to(device)
+        model.load_state_dict(torch.load(model_path, map_location=device))
+        # Switch the model to evaluation mode
+        model.eval()
+        globals()["model"] = model
+
+    if len(table_text_list) <= 0:
+        return []
+
+    data_x = copy.deepcopy(table_text_list)
+    # All-zero placeholder labels with the same layout as the table.
+    data_y = [[0 for col in row] for row in data_x]
+
+    col_len = len(data_x[0])
+
+    # Very wide tables are returned as all-zero without running the model.
+    if col_len >= 50:
+        return data_y
+
+    # Wider tables are fed to the model in smaller row chunks.
+    batch_row_len = 50 if col_len >= 20 else 100
+
+    result_list = []
+    for start in range(0, len(data_x), batch_row_len):
+        batch_data_x = data_x[start:start + batch_row_len]
+        # Slice the labels with the same window so that every row is paired
+        # with its own label row, not with a row from the top of the table.
+        batch_data_y = data_y[start:start + batch_row_len]
+        dataset = CustomDatasetTiny40([batch_data_x], [batch_data_y], mode=0)
+        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
+        # Collect the predictions
+        with torch.no_grad():
+            for data, _targets, _mask in data_loader:
+                data = data.to(device)
+                outputs = model(data)
+                outputs = set_same_table_head(data, outputs)
+                result = torch.zeros_like(outputs)
+                result[outputs >= 0.5] = 1
+                result_list += result.numpy().tolist()
+
+    # Apply the fixed label rules for some specific cells
+    return [set_label(row, row_label) for row, row_label in zip(table_text_list, result_list)]

Niektoré súbory nie sú zobrazené, pretože je v týchto rozdielových dátach zmenené mnoho súborov