3 роки тому · 06d0f8ca74
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -110,6 +110,8 @@ def tableToText(soup):
 
				         for tr in trs:
			
 
				             tr_line = []
			
 
				             tds = tr.findChildren(['td','th'], recursive=False)
			
 
				+            if len(tds)==0:
			
 
				+                tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
			
 
				             for td in tds:
			
 
				                 tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
			
 
				                 #tr_line.append([td.get_text(),0])
			
@@ -660,9 +662,9 @@ def tableToText(soup):
 
				                 for i in range(head_begin,head_end):
			
 
				                     for w in range(len(inner_table[i])):
			
 
				                         if inner_table[i][w][1]==1:
			
 
				-                            _punctuation = ":"
			
 
				+                            _punctuation = "："
			
 
				                         else:
			
 
				-                            _punctuation = ","
			
 
				+                            _punctuation = "，"  #2021/12/15 统一为中文标点，避免 206893924 国际F座1108,1,009,197.49元
			
 
				                         if w>0:
			
 
				                             if inner_table[i][w][0]!= inner_table[i][w-1][0]:
			
 
				                                 text_line += inner_table[i][w][0]+_punctuation
			
@@ -994,15 +996,16 @@ def tableToText(soup):
 
				     pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
			
 
				 
			
 
				     list_innerTable = []
			
 
				-    tbodies = soup.find_all('tbody')
			
 
				+
			
 
				+    tbodies = soup.find_all('table')
			
 
				     # 遍历表格中的每个tbody
			
 
				     #逆序处理嵌套表格
			
 
				     for tbody_index in range(1,len(tbodies)+1):
			
 
				         tbody = tbodies[len(tbodies)-tbody_index]
			
 
				         inner_table = trunTable(tbody)
			
 
				         list_innerTable.append(inner_table)
			
 
				-    '''2021/10/19先找tbody 再找table,避免一个table内多个tbody造成数据丢失'''
			
 
				-    tbodies = soup.find_all('table')
			
 
				+
			
 
				+    tbodies = soup.find_all('tbody')
			
 
				     # 遍历表格中的每个tbody
			
 
				     #逆序处理嵌套表格
			
 
				     for tbody_index in range(1,len(tbodies)+1):
			
@@ -1124,7 +1127,7 @@ def segment(soup,final=True):
 
				                 if "：" in punc_del.strip():
			
 
				                     text = re.sub(punc_del,"：",text)
			
 
				                 else:
			
 
				-                    text = re.sub(punc_del,punc_del.strip()[-1],text)
			
 
				+                    text = re.sub(punc_del,punc_del.strip()[0],text)   #2021/12/09 修正由于某些标签后插入符号把原来符号替换
			
 
				             else:
			
 
				                 text = re.sub(punc_del,"",text)
			
 
				         
			
@@ -1486,6 +1489,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         key_preprocess = "tableToText"
			
 
				         start_time = time.time()
			
 
				         article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
			
 
				+        article_processed = article_processed.replace('．','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
			
 
				+        article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
			
 
				+        article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
			
 
				 
			
 
				         # 提取bidway
			
 
				         list_bidway = extract_bidway(article_processed, _title)
			
@@ -1667,6 +1673,21 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				 
			
 
				             ner_entitys = ner_entitys_all[sentence_index]
			
 
				 
			
 
				+            '''正则识别角色实体  经营部|经销部|电脑部|服务部|复印部|印刷部|彩印部|装饰部|修理部|汽修部|修理店|零售店|设计店|服务店|家具店|专卖店|分店|文具行|商行|印刷厂|修理厂|维修中心|修配中心|养护中心|服务中心|会馆|文化馆|超市|门市|商场|家具城|印刷社|经销处'''
			
 
				+            for it in re.finditer(
			
 
				+                    '(?P<text_key_word>[^，。、；《]{,5}(单一来源|中标|中选|中价|成交)?(供应商|供货商|服务商|候选人|单位|人)(名称)?为?[:：]+)(?P<text>([^，。、；《]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[，。]',
			
 
				+                    sentence_text):
			
 
				+                for k, v in it.groupdict().items():
			
 
				+                    if k == 'text_key_word':
			
 
				+                        keyword = v
			
 
				+                    if k == 'text':
			
 
				+                        entity = v
			
 
				+                b = it.start() + len(keyword)
			
 
				+                e = it.end() - 1
			
 
				+                if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
			
 
				+                    ner_entitys.append((b, e, 'company', entity))
			
 
				+                    # print('正则新增 ：',(b, e, 'company', entity))
			
 
				+
			
 
				 
			
 
				             #识别package
			
 
				 
			
@@ -1718,9 +1739,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				             #                       "front_m":"((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[）\)])\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千万亿元]*)())",
			
 
				             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,，]*(?:\.\d+)?(?:，?)[百千万亿]*)[\(（]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\)）]?)"}
			
 
				             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
			
 
				-                                  "key_word": "((?P<text_key_word>(?:[￥¥]+，?|[单报标限总]价|金额|成交报?价|价格|预算|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果：)(?:[,，（\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[）\)]?)\s*[，,:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]{,1})(?:[（\(]?(?P<filter_>[%])*\s*(单位[:：])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台只吨斤棵株页亩方条米]*))\s*[）\)]?))",
			
 
				+                                  "key_word": "((?P<text_key_word>(?:[￥¥]+，?|[单报标限总]价|金额|成交报?价|价格|预算|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,，（\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[）\)]?)\s*[，,:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]{,1})(?:[（\(]?(?P<filter_>[%])*\s*(单位[:：])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
			
 
				                                   "front_m":"((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?元)\s*[）\)])\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)())",
			
 
				-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,，]*(?:\.\d+)?(?:，?)[百千]*)[\(（]?(?P<unit_behind_m>[万亿]?元(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				+                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?元(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				             # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。
			
 
				 
			
 
				             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
			
@@ -1879,13 +1900,27 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                         notes = '总投资'
			
 
				                     elif re.search('投资', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
			
 
				                         notes = '投资'
			
 
				+                    elif re.search('工程造价', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
			
 
				+                        notes = '工程造价'
			
 
				+                    elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
			
 
				+                          or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\(（]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\)）]*[:：为]',
			
 
				+                                       sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
			
 
				+                          or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
			
 
				+                                       sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
			
 
				+                        notes = '保证金'
			
 
				+                        # print('保证金信息：', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
			
 
				+                    elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
			
 
				+                                   sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
			
 
				+                        notes = '成本警戒线'
			
 
				                     elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为：]', sentence_text[_match.span()[0]:_match.span()[1]]):
			
 
				                         cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
			
 
				                         notes = cost_re.group(1)
			
 
				-                    elif re.search('单价', sentence_text[_match.span()[0]:_match.span()[1]]):
			
 
				+                    elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
			
 
				                         notes = '单价'
			
 
				                     elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
			
 
				                         notes = '大写'
			
 
				+                        if entity_text[0] == "拾":  # 2021/12/16 修正大写金额省略了数字转换错误问题
			
 
				+                            entity_text = "壹"+entity_text
			
 
				                         # print("补充备注：notes = 大写")
			
 
				                     if len(unit)>0:
			
 
				                         if unit.find('万')>=0 and len(entity_text.split('.')[0])>=8: # 2021/7/19 修正万元金额过大的情况
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -52,44 +52,48 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
				     cost_time.update(_cost_time)
			
 
				 
			
 
				     #依赖句子顺序
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() # 公告类型/生命周期提取
			
 
				     list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
			
 
				     cost_time["channel"] = round(time.time()-start_time,2)
			
 
				 
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() # 项目编号、名称提取
			
 
				     codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
			
 
				     log("get codename done of doc_id%s"%(doc_id))
			
 
				     cost_time["codename"] = round(time.time()-start_time,2)
			
 
				 
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() # 角色金额模型提取
			
 
				     predictor.getPredictor("prem").predict(list_sentences,list_entitys)
			
 
				     log("get prem done of doc_id%s"%(doc_id))
			
 
				     cost_time["prem"] = round(time.time()-start_time,2)
			
 
				 
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() # 产品名称及废标原因提取
			
 
				     predictor.getPredictor("product").predict(list_sentences,list_entitys)
			
 
				     log("get product done of doc_id%s"%(doc_id))
			
 
				     cost_time["product"] = round(time.time()-start_time,2)
			
 
				 
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() # 产品相关要素正则提取 单价、数量、品牌规格 ； 项目、需求、预算、时间
			
 
				     product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
			
 
				     log("get product attributes done of doc_id%s"%(doc_id))
			
 
				     cost_time["product_attrs"] = round(time.time()-start_time,2)
			
 
				 
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() #正则角色提取
			
 
				     predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
			
 
				     cost_time["rule"] = round(time.time()-start_time,2)
			
 
				 
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() #联系人模型提取
			
 
				     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
			
 
				     log("get epc done of doc_id%s"%(doc_id))
			
 
				     cost_time["person"] = round(time.time()-start_time,2)
			
 
				 
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() # 时间类别提取
			
 
				     predictor.getPredictor("time").predict(list_sentences, list_entitys)
			
 
				     log("get time done of doc_id%s"%(doc_id))
			
 
				     cost_time["time"] = round(time.time()-start_time,2)
			
 
				 
			
 
				+    start_time = time.time() # 保证金支付方式
			
 
				+    payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
			
 
				+    cost_time["deposit"] = round(time.time()-start_time,2)
			
 
				+
			
 
				     # 需在getPredictor("prem")后  getAttributes.getPREMs 前
			
 
				     if len(re.findall('监理|施工|设计|勘察', title))==1 and re.search('施工|总承包|epc|EPC',title)==None:
			
 
				         keyword = re.search('监理|设计|勘察', title).group(0)
			
@@ -105,13 +109,13 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
				                         _entity.set_Money(1, _entity.values)
			
 
				 
			
 
				     #依赖句子顺序
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() #实体链接
			
 
				     entityLink.link_entitys(list_entitys)
			
 
				     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
			
 
				     log("get attributes done of doc_id%s"%(doc_id))
			
 
				     cost_time["attrs"] = round(time.time()-start_time,2)
			
 
				 
			
 
				-    start_time = time.time()
			
 
				+    start_time = time.time() #失信数据要素提取
			
 
				     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
			
 
				     cost_time["punish"] = round(time.time()-start_time,2)
			
 
				 
			
@@ -121,10 +125,9 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
				                 if product in d['project_name']:
			
 
				                     d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
			
 
				 
			
 
				-    #print(prem)
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1])
			
 
				+    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1], **payment_way_dic)
			
 
				     data_res["cost_time"] = cost_time
			
 
				     data_res["success"] = True
			
 
				 
			
@@ -167,9 +170,14 @@ if __name__=="__main__":
 
				     #     print(rs['product_attrs'])
			
 
				     # print(rs)
			
 
				 
			
 
				-    with open('D:/html/138786703.html', 'r', encoding='utf-8') as f:
			
 
				+    with open('D:/html/2.html', 'r', encoding='utf-8') as f:
			
 
				         text = f.read()
			
 
				+        t1 = time.time()
			
 
				+        print(predict('', text, title))
			
 
				+        t2 = time.time()
			
 
				         print(predict('', text, title))
			
 
				+        t3 = time.time()
			
 
				+        print('第一次耗时：%.4f, 第二次耗时：%.4f'%(t2-t1, t3-t2))
			
 
				     # print(predict('',text,title))
			
 
				 
			
 
				     # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -942,7 +942,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				                                     packageName_entity = "Project"
			
 
				                                 if str(entity.label) in ["2","3","4"]:
			
 
				                                     # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
			
 
				-                                    if entity_after.notes == '单价':
			
 
				+                                    if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值，避免203608823.html 两次金额一次万元没提取到的情况
			
 
				                                         addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
			
 
				                                                          0.5)
			
 
				                                         entity.pointer_money = entity_after
			
@@ -1989,7 +1989,30 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				     while(p_entity>=0):
			
 
				         entity = list_entity[p_entity]
			
 
				         if entity.entity_type=="money":
			
 
				-            if entity.values[entity.label]>=on_value:
			
 
				+            # 2021/12/03 添加成本警戒线、保证金
			
 
				+            if entity.notes in ['保证金', '成本警戒线']:
			
 
				+                packagePointer, _flag = getPackage(PackageList, entity.sentence_index, entity.begin_index,
			
 
				+                                                   "money-" + str(entity.label), MAX_DIS=2, DIRECT="L")
			
 
				+                if packagePointer is None:
			
 
				+                    packageName = "Project"
			
 
				+                else:
			
 
				+                    packageName = packagePointer.entity_text
			
 
				+
			
 
				+                if packageName == "Project":
			
 
				+                    # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
			
 
				+                    #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
			
 
				+                    if entity.notes=="保证金" and "bond" not in PackDict["Project"]:
			
 
				+                        PackDict["Project"]["bond"] = float(entity.entity_text)
			
 
				+                    elif entity.notes=="成本警戒线" and "cost_warning" not in PackDict["Project"]:
			
 
				+                        PackDict["Project"]["cost_warning"] = float(entity.entity_text)
			
 
				+
			
 
				+                else:
			
 
				+                    if entity.notes == "保证金" and "bond" not in PackDict[packageName]:
			
 
				+                        PackDict[packageName]["bond"] = float(entity.entity_text)
			
 
				+                    elif entity.notes == "成本警戒线" and "cost_warning" not in PackDict[packageName]:
			
 
				+                        PackDict[packageName]["cost_warning"] = float(entity.entity_text)
			
 
				+
			
 
				+            elif entity.values[entity.label]>=on_value:
			
 
				                 if str(entity.label)=="1":
			
 
				                     set_tenderer_money.add(float(entity.entity_text))
			
 
				                     list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表，倒序保存所有中标金额
			
@@ -2181,7 +2204,8 @@ def getOtherAttributes(list_entity):
 
				                   "serviceTime":"",
			
 
				                   "product":[],
			
 
				                   "total_tendereeMoney":0,
			
 
				-                  "total_tendereeMoneyUnit":''}
			
 
				+                  "total_tendereeMoneyUnit":''
			
 
				+                   }
			
 
				     dict_time = {
			
 
				         "time_release": [],
			
 
				         "time_bidopen": [],
			
@@ -2211,8 +2235,8 @@ def getOtherAttributes(list_entity):
 
				         elif entity.entity_type=='product':
			
 
				             dict_other["product"].append(entity.entity_text)
			
 
				         elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
			
 
				-            dict_other["total_tendereeMoney"] = float(entity.entity_text)
			
 
				-            dict_other["total_tendereeMoneyUnit"] = entity.money_unit
			
 
				+                dict_other["total_tendereeMoney"] = float(entity.entity_text)
			
 
				+                dict_other["total_tendereeMoneyUnit"] = entity.money_unit
			
 
				     # 时间类别
			
 
				     for time_type,value in dict_time.items():
			
 
				         list_time = dict_time[time_type]
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -35,7 +35,8 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
 
				                   "punish":{"predictor":None,"Lock":RLock()},
			
 
				                   "product":{"predictor":None,"Lock":RLock()},
			
 
				                 "product_attrs":{"predictor":None,"Lock":RLock()},
			
 
				-                  "channel": {"predictor": None, "Lock": RLock()}}
			
 
				+                  "channel": {"predictor": None, "Lock": RLock()},
			
 
				+                  "deposit_payment_way": {"predictor": None, "Lock": RLock()}}
			
 
				 
			
 
				 
			
 
				 def getPredictor(_type):
			
@@ -62,6 +63,8 @@ def getPredictor(_type):
 
				                     dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
			
 
				                 if _type == "channel":
			
 
				                     dict_predictor[_type]["predictor"] = DocChannel()
			
 
				+                if _type == 'deposit_payment_way':
			
 
				+                    dict_predictor[_type]["predictor"] = DepositPaymentWay()
			
 
				             return dict_predictor[_type]["predictor"]
			
 
				     raise NameError("no this type of predictor")
			
 
				 
			
@@ -542,6 +545,7 @@ class PREMPredict():
 
				             list_entitys:文章的entitys
			
 
				         @return:角色模型的输入数据
			
 
				         '''
			
 
				+        text_list = []
			
 
				         data_x = []
			
 
				         points_entitys = []
			
 
				         for list_entity,list_sentence in zip(list_entitys,list_sentences):
			
@@ -556,6 +560,7 @@ class PREMPredict():
 
				                     while(p_sentences<len(list_sentence)):
			
 
				                         sentence = list_sentence[p_sentences]
			
 
				                         if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
			
 
				+                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
			
 
				                             #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
			
 
				                             item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
			
 
				                             data_x.append(item_x)
			
@@ -568,7 +573,7 @@ class PREMPredict():
 
				         if len(points_entitys)==0:
			
 
				             return None
			
 
				         
			
 
				-        return [data_x,points_entitys]
			
 
				+        return [data_x,points_entitys, text_list]
			
 
				     
			
 
				     
			
 
				     def search_money_data(self,list_sentences,list_entitys):
			
@@ -579,6 +584,7 @@ class PREMPredict():
 
				             list_entitys:文章的entitys
			
 
				         @return:金额模型的输入数据
			
 
				         '''
			
 
				+        text_list = []
			
 
				         data_x = []
			
 
				         points_entitys = []
			
 
				         for list_entity,list_sentence in zip(list_entitys,list_sentences):
			
@@ -594,6 +600,7 @@ class PREMPredict():
 
				                     while(p_sentences<len(list_sentence)):
			
 
				                         sentence = list_sentence[p_sentences]
			
 
				                         if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
			
 
				+                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
			
 
				                             #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
			
 
				                             #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
			
 
				                             item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
			
@@ -606,7 +613,7 @@ class PREMPredict():
 
				         if len(points_entitys)==0:
			
 
				             return None
			
 
				         
			
 
				-        return [data_x,points_entitys]
			
 
				+        return [data_x,points_entitys, text_list]
			
 
				     
			
 
				     def predict_role(self,list_sentences, list_entitys):
			
 
				         datas = self.search_role_data(list_sentences, list_entitys)
			
@@ -614,6 +621,7 @@ class PREMPredict():
 
				         if datas is None:
			
 
				             return
			
 
				         points_entitys = datas[1]
			
 
				+        text_list = datas[2]
			
 
				 
			
 
				 
			
 
				         if USE_PAI_EAS:
			
@@ -641,17 +649,24 @@ class PREMPredict():
 
				         for i in range(len(predict_y)):
			
 
				             entity = points_entitys[i]
			
 
				             label = np.argmax(predict_y[i])
			
 
				-            values = []
			
 
				-            for item in predict_y[i]:
			
 
				-                values.append(item)
			
 
				-                entity.set_Role(label,values)
			
 
				-        
			
 
				+            values = predict_y[i]
			
 
				+            text = text_list[i]
			
 
				+            if label == 2:
			
 
				+                if re.search('中标单位和.{,25}签订合同', text):
			
 
				+                    label = 0
			
 
				+                    values[label] = 0.501
			
 
				+                elif re.search('尊敬的供应商：.{,25}我公司', text):
			
 
				+                    label = 0
			
 
				+                    values[label] = 0.801
			
 
				+            entity.set_Role(label, values)
			
 
				+
			
 
				     def predict_money(self,list_sentences,list_entitys):
			
 
				         datas = self.search_money_data(list_sentences, list_entitys)
			
 
				         if datas is None:
			
 
				             return
			
 
				         points_entitys = datas[1]
			
 
				         _data = datas[0]
			
 
				+        text_list = datas[2]
			
 
				         if USE_PAI_EAS:
			
 
				             _data = np.transpose(np.array(_data),(1,0,2,3))
			
 
				             request = tf_predict_pb2.PredictRequest()
			
@@ -677,7 +692,10 @@ class PREMPredict():
 
				             entity = points_entitys[i]
			
 
				             label = np.argmax(predict_y[i])
			
 
				             values = predict_y[i]
			
 
				-            if label ==0 and entity.notes=="投资":
			
 
				+            text = text_list[i]
			
 
				+            if label == 1 and re.search('[:：，。](总金额|总价|单价)', text):
			
 
				+                values[label] = 0.49
			
 
				+            elif label ==0 and entity.notes in ["投资", "工程造价"]:
			
 
				                 values[label] = 0.49
			
 
				             entity.set_Money(label, values)
			
 
				         
			
@@ -1065,17 +1083,17 @@ class FormPredictor():
 
				 class RoleRulePredictor():
			
 
				     
			
 
				     def __init__(self):
			
 
				-        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|转让|招租|甲|议标|合同主体|比选)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|业主名称|需方|询价单位)(是|为|信息|：|:|\s*)$)"
			
 
				+        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|需方)(名称)?(是|为|信息|：|:|\s*)$)"
			
 
				         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
			
 
				         self.pattern_tenderee_right = "(?P<tenderee_right>^(\((以下简称)?[\"”]?(招标|采购)(人|单位|机构)\)?))"  #|(^[^.。，,:：](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
			
 
				         
			
 
				         self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|招标机构)(.{,4}名，?称|全称|是|为|：|:|[,，]?\s*)$|(受.{,20}委托))"
			
 
				         self.pattern_agency_right = "(?P<agency_right>^(\((以下简称)?[\"”]?(代理)(人|单位|机构)\))|受.{,15}委托)"
			
 
				         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
			
 
				-        self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[：:是为]+$|(选定单位|指定的中介服务机构))[：:是为，]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[：:是为]+$|((评审结果|名次|排名)[:：]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|：|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[:：]$)"
			
 
				+        self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|各?供应商|方|公司|厂商|商)[：:是为]+$|(选定单位|指定的中介服务机构))[：:是为，]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[：:是为]+$|((评审结果|名次|排名)[:：]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|：|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[:：]$)"
			
 
				         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[：:是为])"
			
 
				-        self.pattern_winTenderer_right = "(?P<winTenderer_right>^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))"
			
 
				-        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果：由.{5,20}供货)"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果：由.{5,20}供货
			
 
				+        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低，确定为本项目成交供应商)"
			
 
				+        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果：由.{5,20}供货)|中标通知书.{,15}你方"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果：由.{5,20}供货
			
 
				 
			
 
				         # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[：:]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|：|:|\s*$)|((评审结果|名次|排名)[:：]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
			
 
				 
			
@@ -1193,7 +1211,7 @@ class RoleRulePredictor():
 
				                                                     _role = _group.split("_")[0]
			
 
				                                                     _direct = _group.split("_")[1]
			
 
				                                                     _label = {"tenderee":0,"agency":1,"winTenderer":2,"secondTenderer":3,"thirdTenderer":4}.get(_role)
			
 
				-                                                    if _i_span==0 and _direct=="left":
			
 
				+                                                    if _i_span==0 and _direct=="left" and '各供应商' not in _v_group: #2021/12/22 修正错误中标召回 例子208668937
			
 
				                                                         _flag = True
			
 
				                                                         _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
			
 
				                                                         list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
			
@@ -2186,6 +2204,39 @@ class DocChannel():
 
				       # return self.id2type[id], prob
			
 
				       return [{'docchannel':self.id2type[id]}]
			
 
				 
			
 
				+# 保证金支付方式提取
			
 
				+class DepositPaymentWay():
			
 
				+    def __init__(self,):
			
 
				+        self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[:：]*([^，。]{,60})'
			
 
				+        self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
			
 
				+        kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
			
 
				+               '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
			
 
				+               '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
			
 
				+               '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
			
 
				+        self.kws = sorted(kws, key=lambda x: len(x), reverse=True)
			
 
				+
			
 
				+    def predict(self,content):
			
 
				+        pay_way = {'deposit_patment_way':''}
			
 
				+        result = []
			
 
				+        pay = re.search(self.pt, content)
			
 
				+        if pay:
			
 
				+            # print(pay.group(0))
			
 
				+            pay = pay.group(3)
			
 
				+            for it in re.finditer('|'.join(self.kws), pay):
			
 
				+                result.append(it.group(0))
			
 
				+            pay_way['deposit_patment_way'] = '；'.join(result)
			
 
				+            return pay_way
			
 
				+        pay = re.search(self.pt2, content)
			
 
				+        if pay:
			
 
				+            # print(pay.group(0))
			
 
				+            pay = pay.group(2)
			
 
				+            for it in re.finditer('|'.join(self.kws), pay):
			
 
				+                result.append(it.group(0))
			
 
				+            pay_way['deposit_patment_way'] = '；'.join(result)
			
 
				+            return pay_way
			
 
				+        else:
			
 
				+            return pay_way
			
 
				+
			
 
				 def getSavedModel():
			
 
				     #predictor = FormPredictor()
			
 
				     graph = tf.Graph()