Merge branch 'master' of http://192.168.2.65:3000/BIDI-ML/BIDI_ML_INFO_EXTRACTION

# Conflicts:
#	BiddingKG/dl/interface/extract.py
#	BiddingKG/dl/interface/predictor.py
lishimin 3 years ago
parent
commit
1c3a9cfc60

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (py3.5)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 1 - 1
BiddingKG.iml

@@ -7,7 +7,7 @@
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 3.5 (py3.5)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
   </component>

+ 219 - 0
BiddingKG/dl/channel/re_channel_103.py

@@ -0,0 +1,219 @@
+import pandas as pd
+import re
+
+# 各投标人
+# 各潜在投标人
+# 各潜在投标人:
+# 致各招标文件持有者:
+# 致各投标人
+# 各潜在投标供应商:
+
+# 修改、澄清(答疑)纪要内容如下: 1、
+# 答疑澄清与修改的主要内容:
+# 对文件澄清与修改的主要内容
+# 澄清、修改内容要点
+# 答疑纪要
+# 答疑如下
+# 招标文件答疑和招标文件修改通知
+# 招标文件答疑通知
+# 答疑及补遗通知
+# 答疑回复如下:
+# 现对投标人提出的质疑回复如下:
+# 对文件澄清与修改的主要内容 详见招标文件
+# 修改的主要内容 详见附件
+# 澄清或修改事项:
+
+# 第1次答疑
+# 第1次答疑澄清
+
+# 答疑补遗文件
+# 补遗书澄清文件 答疑澄清
+# 质疑1
+# 问题
+# 答疑文件1
+# 具体补遗内容详见附件
+# 请问 答
+# 问题 回复
+# 答疑澄清公告 1:
+# 现对招标文件作如下澄清:
+# 详见答疑澄清文件
+# 详见答疑文件。
+
+
+channel_103 = '(澄清|答疑|补遗|修改)'
+channel_103_0 = '(致|至|)(各|各个)(潜在|)(投标|招标|招标文件持有|报价|竞选|)(人|者|供应商|单位)(:|:)'
+channel_103_1 = '(澄清|答疑|补遗|修改|质疑)(.?)(具体内容|主要内容|内容|回复|发布|纪要|事项|如下){1,2}(.?)' \
+                '(如下|[::]|详见|点击下载附件|[1一][::、]|(1)|\\(1\\)|一)'
+channel_103_2 = '第(.?)次(答疑|澄清)'
+channel_103_3 = '(澄清|答疑|补遗|修改)(公告|文件)'
+channel_103_after = '(请问|提问|问题|答复|回复|质疑|答|问){1,2}[12一]?[::]|[一1][::、]|(1)|\\(1\\)|(详见|见)(附件|答疑文件|澄清文件|答疑澄清文件)'
+channel_103_4 = '(补充答疑|提疑内容|请问|提问|问题|回复|答复|答疑|质疑|答|问)[12一]?[::]'
+channel_103_5 = '(见|详见)(答疑澄清文件|澄清文件|答疑文件)|补遗内容详见附件'
+
+# 答疑澄清时间
+# 对文件澄清与修改的主要内容 无澄清文件
+# 对文件澄清与修改的主要内容 无
+# 请各投标单位自行下载
+not_channel_103 = '答疑澄清时间|主要内容.?无|请各投标单位'
+
+
+def re_standard_channel_103(_str):
+    channel_103_list = []
+
+    if not re.search(channel_103, _str):
+        print("not")
+        return channel_103_list
+
+    reg_standard = "(?P<value>" + channel_103_0 + ")"
+    match = re.finditer(reg_standard, _str)
+    for m in match:
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword_index = [m_span[0], m_span[1]]
+        keyword = m_dict.get('value')
+        channel_103_list.append([keyword, keyword_index])
+    if channel_103_list:
+        print("0", channel_103_list)
+        return channel_103_list
+
+    reg_standard = "(?P<value>" + channel_103_1 + ")"
+    match = re.finditer(reg_standard, _str)
+    for m in match:
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword_index = [m_span[0], m_span[1]]
+        keyword = m_dict.get('value')
+        channel_103_list.append([keyword, keyword_index])
+    if channel_103_list:
+        print("1", channel_103_list)
+        return channel_103_list
+
+    reg_standard = "(?P<value>" + channel_103_2 + ")"
+    match = re.finditer(reg_standard, _str)
+    for m in match:
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword_index = [m_span[0], m_span[1]]
+        keyword = m_dict.get('value')
+        if re.search(channel_103_after, _str[keyword_index[1]:keyword_index[1]+50]):
+            channel_103_list.append([keyword, keyword_index])
+    if channel_103_list:
+        print("2", channel_103_list)
+        return channel_103_list
+
+    reg_standard = "(?P<value>" + channel_103_3 + ")"
+    match = re.finditer(reg_standard, _str)
+    for m in match:
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword_index = [m_span[0], m_span[1]]
+        keyword = m_dict.get('value')
+        if re.search(channel_103_after, _str[keyword_index[1]:keyword_index[1]+50]):
+            channel_103_list.append([keyword, keyword_index])
+    if channel_103_list:
+        print("3", channel_103_list)
+        return channel_103_list
+
+    reg_standard = "(?P<value>" + channel_103_4 + ")"
+    match = re.finditer(reg_standard, _str)
+    for m in match:
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword_index = [m_span[0], m_span[1]]
+        keyword = m_dict.get('value')
+        channel_103_list.append([keyword, keyword_index])
+    if channel_103_list:
+        print("4", channel_103_list)
+        return channel_103_list
+
+    reg_standard = "(?P<value>" + channel_103_5 + ")"
+    match = re.finditer(reg_standard, _str)
+    for m in match:
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword_index = [m_span[0], m_span[1]]
+        keyword = m_dict.get('value')
+        channel_103_list.append([keyword, keyword_index])
+    if channel_103_list:
+        print("5", channel_103_list)
+        return channel_103_list
+
+    return channel_103_list
+
+
+def re_not_channel_103(_str):
+    match = re.findall(not_channel_103, _str)
+    if match:
+        for word in match:
+            instead = "#" * len(word)
+            _str = re.sub(re.escape(word), instead, _str)  # escape: matched text can contain regex metacharacters
+    return _str
+
+
+def re_channel_103(text):
+    # mask easily-confused phrases first
+    clean_text = re_not_channel_103(text)
+
+    # look for standard-form matches
+    channel_103_list = re_standard_channel_103(clean_text)
+    return channel_103_list
+
+
+def extract_channel_103(text):
+    result_list = []
+    channel_103_list = re_channel_103(text)
+    if channel_103_list:
+        for word, text_index in channel_103_list:
+            if word is not None:
+                if text_index[1]-text_index[0] != len(word) \
+                        or text_index[1]-text_index[0] >= 20:
+                    return []
+                d = {"body": word, "begin_index": text_index[0], "end_index": text_index[1]}
+                result_list.append(d)
+    return result_list
+
+
+def test_csv(_path):
+    df = pd.read_csv(_path)
+
+    predict_list = []
+    for index, row in df.iterrows():
+        word_list = re_channel_103(row["doctextcon"])  # re_channel_103 takes only the text
+        if word_list:
+            predict = word_list
+        else:
+            predict = []
+        print("predict", predict)
+        predict_list.append(str(predict))
+
+    predict_df = pd.DataFrame(predict_list)
+    df = pd.concat([df, predict_df], axis=1)
+
+    df.to_csv(_path)
+    print("finish write!")
+
+
+def test_str():
+    s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
+    s = '''
+    (第1次澄清) 发布时间:2020-11-25 致各招标文件持有者: 招标人──舟山市
+    '''
+    print(extract_channel_103(s))
+
+
+def test_html():
+    html_path = "C:/Users/Administrator/Desktop/3.html"
+
+    with open(html_path, "r") as f:
+        s = f.read()
+
+    print(extract_channel_103(s))  # extract_channel_103 takes no title argument
+
+
+if __name__ == "__main__":
+    path = "D:\\BIDI_DOC\\比地_文档\\澄清答疑_result.csv"
+    # test_csv(path)
+    test_str()
+    # test_html(path)
+    pass
+

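For reference, a minimal driver for the new module (a sketch: it assumes BiddingKG is importable and reuses the sample string from test_str above; the printed indices depend on exact whitespace):

from BiddingKG.dl.channel.re_channel_103 import extract_channel_103

s = "(第1次澄清) 发布时间:2020-11-25 致各招标文件持有者: 招标人──舟山市"
print(extract_channel_103(s))
# -> [{'body': '致各招标文件持有者:', 'begin_index': ..., 'end_index': ...}]
# channel_103_0 (the addressee pattern) matches first, so the later patterns are skipped.
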
+ 20 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -62,6 +62,15 @@ def link_entitys(list_entitys,on_value=0.8):
                         if len(_ent.entity_text)>len(_entity.entity_text):
                             _entity.entity_text = _ent.entity_text
 
+        # 2021/12/21: replace with the longest similar entity recognized via the dictionary
+        for _entity in range_entity:
+            for _ent in _entity.linked_entitys:
+                # print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
+                if re.search("公司$", _ent.entity_text) is not None \
+                        and _ent.if_dict_match == 1:
+                    if len(_ent.entity_text) > len(_entity.entity_text):
+                        _entity.entity_text = _ent.entity_text
+
 
 def getEnterprisePath():
     filename = "../LEGAL_ENTERPRISE.txt"
@@ -146,6 +155,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
         for p_sentence in list_sentence:
             sentence = p_sentence.sentence_text
             list_match = match_enterprise_max_first(sentence)
+            # print("list_match", list_match)
 
             doc_id = p_sentence.doc_id
             sentence_index = p_sentence.sentence_index
@@ -164,10 +174,14 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                     if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
                         find_flag = True
                         p_entity.entity_type = "company"
+                        p_entity.if_dict_match = 1
 
                     if p_entity.entity_type not in ["location","org","company"]:
                         continue
 
+                    if _match["entity_text"] == p_entity.entity_text:
+                        p_entity.if_dict_match = 1
+
                     #有重叠
                     #match部分被包含则不处理
                     if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
@@ -189,6 +203,8 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             p_entity.wordOffset_end = _match["end_index"]
                             p_entity.begin_index = begin_index
                             p_entity.end_index = end_index
+                            # this company entity was recognized from the dictionary
+                            p_entity.if_dict_match = 1
 
                             for _match_h in range(_match_index+1,_match_j+1):
                                 entity_text = list_match[_match_h]["entity_text"]
@@ -198,6 +214,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                 end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                 add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
+                                add_entity.if_dict_match = 1
                                 list_entity.append(add_entity)
 
                                 range_entity.append(add_entity)
@@ -225,6 +242,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                     p_entity.wordOffset_end = _match["end_index"]
                                     p_entity.begin_index = begin_index
                                     p_entity.end_index = end_index
+                                    p_entity.if_dict_match = 1
                         elif _match["end_index"]>=p_entity.wordOffset_end:
                             match_replace = True
                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
@@ -236,6 +254,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             p_entity.begin_index = begin_index
                             p_entity.end_index = end_index
                             p_entity.entity_type = "company"
+                            p_entity.if_dict_match = 1
                     elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                         find_flag = True
                         if p_entity.entity_type in ("org","company"):
@@ -248,12 +267,12 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             p_entity.wordOffset_end = _match["end_index"]
                             p_entity.begin_index = begin_index
                             p_entity.end_index = end_index
+                            p_entity.if_dict_match = 1
                 if not find_flag:
                     match_add = True
                     entity_text = _match["entity_text"]
                     entity_type = "company"
 
-
                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                     end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)

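The first hunk above prefers the longest dictionary-confirmed company alias (if_dict_match == 1, name ending in 公司). A standalone sketch of just that rule, using a hypothetical stand-in for the real Entity class and an invented company name:

import re

class FakeEntity:  # hypothetical stand-in, not the real BiddingKG Entity
    def __init__(self, text, if_dict_match=0):
        self.entity_text = text
        self.if_dict_match = if_dict_match
        self.linked_entitys = []

short = FakeEntity("比地科技")
full = FakeEntity("广州比地数据科技有限公司", if_dict_match=1)  # invented example name
short.linked_entitys.append(full)

for _ent in short.linked_entitys:
    # same condition as the diff: dictionary-matched, ends with 公司, and longer
    if re.search("公司$", _ent.entity_text) is not None and _ent.if_dict_match == 1:
        if len(_ent.entity_text) > len(short.entity_text):
            short.entity_text = _ent.entity_text

print(short.entity_text)  # -> 广州比地数据科技有限公司
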
+ 231 - 0
BiddingKG/dl/if_joint_bidding/re_if_joint_bidding.py

@@ -0,0 +1,231 @@
+import pandas as pd
+import re
+
+# 申请人可以组成联合体报名,联合体的家数最多不能超过两家
+# 本项目不接受供应商以联合体方式进行投标。
+
+bidway = '(参与|)(报价|投标|招标|竞价|报名|参加|资格预审|应答|谈判|磋商|竞标)(登记|)'
+
+# 是否接收联合体投标: 不接受
+# 联合体投标: 不允许
+# 是否允许联合体投标登记:是
+# (是/否)接受联合体投标:否
+# 是否接受联合体投标 不接受
+# 是否接受联合体投标:不接受
+# 本项目(是/否)接受联合体投标:否
+# joint_bidding_prefix_1 = '(不[ ]?|[((]{0,1}[ ]?[是否不][ ]?[))]{0,1}|)'
+joint_bidding_prefix_1 = "(是否|)"
+bidway_1 = bidway
+joint_bidding_body_1 = '(允许|接受|接收|)(联合体|独立体或联合体)' + bidway_1
+joint_bidding_suffix_1 = '([ ::。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'
+
+# 不接受(接受或不接受)联合体投标
+# (否)接受联合体。
+# (不)接受联合体投标
+# ( 不 )接受联合体。
+# 本项目 不 允许联合体投标。
+# (否)接受联合体投标
+# 本项目不接受联合体参与投标。
+# 本合同包接受联合体投标
+# 本项目不接受联合体应答,
+# 不接受联合体投标
+# 否 接受联合体
+# 接受 联合体资格预审
+# 接受独立体或联合体报名,联合体的家数最多不能超过两家
+joint_bidding_prefix_2 = '(不[ ]?|[((]{0,1}[ ]?[是否不][ ]?[))]{0,1}|)'
+bidway_2 = "(" + bidway + "|)"
+joint_bidding_body_2 = '(允许|接受|接收).?(联合体|独立体或联合体)' + bidway_2
+joint_bidding_suffix_2 = '([ ::。]{0,2})(不接受|不接收|不允许|允许|接受|接收|是|否|)'
+# joint_bidding_suffix_2 = ""
+
+# 是否允许联合体 不允许
+joint_bidding_prefix_3 = '(是否)'
+joint_bidding_body_3 = '(允许|接受|接收).?(联合体|独立体或联合体)'
+joint_bidding_suffix_3 = '([ ::。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'
+
+
+# 是否接受联合体投标:( )是(√ )否。
+
+
+# 投标人须知前附表规定接受联合体投标的
+# 联合体投标的,
+# 允许联合体投标的
+# 如项目接受联合体投标
+# (是/否)接受联合体投标: 是 否
+# 招标□接受 ?不接受联合体投标
+# 联合体投标:接受;不接受
+# (是/否)
+# 是 否
+# 接受;不接受
+# 接受 ?不接受
+# (接受或不接受)
+# 是否允许联合体: 1 是 0 否
+# 允许联合体报名 □是 ■ 否
+not_joint_bidding_1 = '(' \
+                      '联合体投标的|如项目接受联合体投标' \
+                      '|是否允许联合体: 1 是 0 否' \
+                      '|联合体参加的|联合体牵头人|联合体牵头方|联合体成员|联合体(牵头人)' \
+                      '|联合体各方|联合体协议' \
+                      '|允许联合体报名 □是 ■ 否' \
+                      ')'
+not_joint_bidding_2 = '(' \
+                      '[((]{0,1}.?是.{1,2}否[))]{0,1}' \
+                      '|[((]{0,1}.?接受.{0,2}不接受[))]{0,1}' \
+                      '|1 是 0 否' \
+                      '|.{1}接受.{1,2}不接受' \
+                      ')'
+
+
+def re_not_joint_bidding(_str):
+    _str = re.sub(not_joint_bidding_1, "", _str)
+    _str = re.sub(not_joint_bidding_2, "", _str)
+    return _str
+
+
+def re_standard_joint_bidding(_str):
+    # form 1
+    reg_standard = "(?P<prefix>" + joint_bidding_prefix_1 + ")" \
+                   + "(?P<body>" + joint_bidding_body_1 + ")" \
+                   + "(?P<suffix>" + joint_bidding_suffix_1 + ")"
+    # print("prefix", re.findall(joint_bidding_prefix_1, _str))
+    # print("body", re.search(joint_bidding_body_1, _str))
+    # print("suffix", re.search(joint_bidding_suffix_1, _str))
+    match = re.finditer(reg_standard, _str)
+    joint_bidding_list = []
+    for m in match:
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword = ""
+        keyword += m_dict.get("prefix") + m_dict.get("body") + m_dict.get("suffix")
+        joint_bidding_list.append([keyword, [m_span[0], m_span[1]]])  # keep [begin, end] as one list, matching forms 2/3 and the unpacking in extract_joint_bidding
+    if joint_bidding_list:
+        return joint_bidding_list
+
+    # form 2
+    reg_standard = "(?P<prefix>" + joint_bidding_prefix_2 + ")" \
+                   + "(?P<body>" + joint_bidding_body_2 + ")" \
+                   + "(?P<suffix>" + joint_bidding_suffix_2 + ")"
+    match = re.finditer(reg_standard, _str)
+    # print("prefix", re.findall(joint_bidding_prefix_2, _str))
+    # print("body", re.search(joint_bidding_body_2, "接受 联合体资格预审"))
+    # print("suffix", re.search(joint_bidding_suffix_2, _str))
+    joint_bidding_list = []
+    for m in match:
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword = ""
+        keyword += m_dict.get("prefix") + m_dict.get("body") + m_dict.get("suffix")
+        # exclude matches preceded by '是' (the '是否' "whether" case)
+        if _str[m_span[0]-1:m_span[0]] != "是":
+            joint_bidding_list.append([keyword, [m_span[0], m_span[1]]])
+    if joint_bidding_list:
+        return joint_bidding_list
+
+    # form 3
+    reg_standard = "(?P<prefix>" + joint_bidding_prefix_3 + ")" \
+                   + "(?P<body>" + joint_bidding_body_3 + ")" \
+                   + "(?P<suffix>" + joint_bidding_suffix_3 + ")"
+    match = re.finditer(reg_standard, _str)
+    # print("prefix", re.findall(joint_bidding_prefix_2, _str))
+    # print("body", re.search(joint_bidding_body_2, "接受 联合体资格预审"))
+    # print("suffix", re.search(joint_bidding_suffix_2, _str))
+    joint_bidding_list = []
+    for m in match:
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword = ""
+        keyword += m_dict.get("prefix") + m_dict.get("body") + m_dict.get("suffix")
+        joint_bidding_list.append([keyword, [m_span[0], m_span[1]]])
+    if joint_bidding_list:
+        return joint_bidding_list
+
+    return joint_bidding_list
+
+
+def re_joint_bidding(text):
+    # normalize half-width parentheses to full-width
+    text_clean = re.sub("\\(", "(", text)
+    text_clean = re.sub("\\)", ")", text_clean)
+
+    # mask easily-confused phrases
+    text_clean = re_not_joint_bidding(text_clean)
+    # print("clean", text_clean)
+
+    # look for standard-form matches
+    joint_bidding_list = re_standard_joint_bidding(text_clean)
+    return joint_bidding_list
+
+
+def judge_joint_bidding(_list):
+    new_list = []
+    for l in _list:
+        if "否" in l[0] or "不" in l[0]:
+            new_list.append(["0" + " " + l[0], l[1]])
+        else:
+            new_list.append(["1" + " " + l[0], l[1]])
+
+    return new_list
+
+
+def extract_joint_bidding(text):
+    result_list = []
+    joint_bidding_list = re_joint_bidding(text)
+    joint_bidding_list = judge_joint_bidding(joint_bidding_list)
+    if joint_bidding_list:
+        for word, text_index in joint_bidding_list:
+            if word is not None:
+                d = {"body": word, "begin_index": text_index[0], "end_index": text_index[1]}
+                result_list.append(d)
+    return result_list
+
+
+def test_csv(_path):
+    df = pd.read_csv(_path)
+
+    predict_list = []
+    for index, row in df.iterrows():
+        word_list = re_joint_bidding(row["doctextcon"])
+        if word_list:
+            predict = word_list
+        else:
+            predict = []
+        print("predict", predict)
+        predict_list.append(str(predict))
+
+    predict_df = pd.DataFrame(predict_list)
+    df = pd.concat([df, predict_df], axis=1)
+
+    df.to_csv(_path)
+    print("finish write!")
+
+
+def test_str():
+    # (不)接受联合体投标
+    # 本项目不接受供应商以联合体方式进行投标。
+    # (否)接受联合体。
+    # 是否接收联合体投标: 不接受
+    # 联合体投标: 不允许
+    # 是否允许联合体投标登记:是
+    s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
+    s = '''
+测绘服务 是否允许联合体 不允许 行业
+    '''
+    print(extract_joint_bidding(s))
+
+
+def test_html(_path):
+    html_path = _path
+
+    with open(html_path, "r") as f:
+        s = f.read()
+
+    print(extract_joint_bidding(s))  # extract_joint_bidding takes no title argument
+
+
+if __name__ == "__main__":
+    path = "D:\\BIDI_DOC\\比地_文档\\投标工期_result.csv"
+    test_csv(path)
+    # test_str()
+    # test_html(path)
+    pass
+

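A quick driver for the new extractor (a sketch; assumes BiddingKG is importable, sample taken from test_str above). judge_joint_bidding prefixes the body with "0" when consortium bidding is rejected and "1" when it is accepted:

from BiddingKG.dl.if_joint_bidding.re_if_joint_bidding import extract_joint_bidding

print(extract_joint_bidding("测绘服务 是否允许联合体 不允许 行业"))
# -> [{'body': '0 是否允许联合体 不允许', 'begin_index': ..., 'end_index': ...}]
# The "0 " prefix marks "consortium not allowed"; form 3 (是否 + 允许联合体) matched here.
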
+ 4 - 1
BiddingKG/dl/interface/Entitys.py

@@ -168,7 +168,10 @@ class Entity():
         self.pointer_email = None
         self.is_tail = False
         self.notes = ''  # added 2021/7/20: stores notes such as the capitalized form of the amount, its unit, etc.
-        self.money_unit = '' # added 2021/8/17: stores the money unit (元 / 万元 / 亿元)
+        self.money_unit = ''  # added 2021/8/17: stores the money unit (元 / 万元 / 亿元)
+        self.if_dict_match = 0  # added 2021/12/21: whether this company entity was recognized from the dictionary
+        self.is_total_money = 0  # added 2021/12/29: whether this money amount is a total price
+        self.is_unit_money = 0  # added 2021/12/29: whether this money amount is a unit price
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)

+ 30 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -6,6 +6,9 @@ import sys
 import os
 import time
 import codecs
+
+from BiddingKG.dl.ratio.re_ratio import extract_ratio
+
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
 sys.path.append(os.path.abspath(".."))
@@ -1083,8 +1086,8 @@ def segment(soup,final=True):
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
     # 感叹号替换为中文句号
     text = re.sub("(?<=[\u4e00-\u9fa5])[!!]|[!!](?=[\u4e00-\u9fa5])","。",text)
-    #替换"?"为 " " ,update:2021/7/20
-    text = re.sub("?"," ",text)
+    # replace runs of "?" left by unrecognized formatting with " ", update:2021/7/20
+    text = re.sub("[\?]{2,}"," ",text)
 
 
     #替换"""为"“",否则导入deepdive出错
@@ -2106,6 +2109,31 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #         Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
             #                begin_index_temp, end_index_temp))
 
+            # added 2021/12/29: ratio extraction
+            list_ratio = extract_ratio(sentence_text)
+            entity_type = "ratio"
+            for ratio in list_ratio:
+                # print("ratio", ratio)
+                begin_index_temp = ratio['begin_index']
+                for j in range(len(list_tokenbegin)):
+                    if list_tokenbegin[j] == begin_index_temp:
+                        begin_index = j
+                        break
+                    elif list_tokenbegin[j] > begin_index_temp:
+                        begin_index = j - 1
+                        break
+                index = ratio['end_index']
+                end_index_temp = index
+                for j in range(begin_index, len(list_tokenbegin)):
+                    if list_tokenbegin[j] >= index:
+                        end_index = j - 1
+                        break
+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+                entity_text = ratio['body']
+                list_sentence_entitys.append(
+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp))
+
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys
         list_entitys.append(list_entitys_temp)

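The loop above maps the character offsets returned by extract_ratio onto token indexes. A self-contained sketch of that mapping, assuming list_tokenbegin[j] holds the character offset at which token j starts:

def char_offset_to_token_index(list_tokenbegin, char_offset):
    # exact hit -> that token; otherwise the last token starting before the offset
    for j, begin in enumerate(list_tokenbegin):
        if begin == char_offset:
            return j
        if begin > char_offset:
            return j - 1
    return len(list_tokenbegin) - 1

tokens = ["中标", "金额", ":", "100", "万元"]
list_tokenbegin = [0, 2, 4, 5, 8]  # start offset of each token
print(char_offset_to_token_index(list_tokenbegin, 5))  # -> 3, i.e. "100"
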
+ 14 - 7
BiddingKG/dl/interface/extract.py

@@ -24,10 +24,11 @@ import BiddingKG.dl.interface.Preprocessing as Preprocessing
 import BiddingKG.dl.interface.getAttributes as getAttributes
 import BiddingKG.dl.complaint.punish_predictor as punish_rule
 import json
+from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
+from BiddingKG.dl.ratio.re_ratio import extract_ratio
 
 
-
-#自定义jsonEncoder
+# custom JSON encoder
 class MyEncoder(json.JSONEncoder):
     def default(self, obj):
         if isinstance(obj, np.ndarray):
@@ -113,8 +114,13 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
                         _entity.values[1] = 0.51
                         _entity.set_Money(1, _entity.values)
 
-    #依赖句子顺序
-    start_time = time.time() #实体链接
+    # added 2021-12-29: extract total price and unit price
+    start_time = time.time()  # total/unit price extraction
+    predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
+    cost_time["total_unit_money"] = round(time.time()-start_time, 2)
+
+    # depends on sentence order
+    start_time = time.time()  # entity linking
     entityLink.link_entitys(list_entitys)
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
     log("get attributes done of doc_id%s"%(doc_id))
@@ -138,6 +144,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
     # for _article in list_articles:
     #     log(_article.content)
+    #
     # for list_entity in list_entitys:
     #     for _entity in list_entity:
     #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
@@ -145,7 +152,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     #                str(_entity.begin_index),str(_entity.end_index)))
 
     return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
-    # return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False), list_articles[0].content, list_entitys[0]
+
 
 def test(name,content):
     user = {
@@ -174,12 +181,12 @@ if __name__=="__main__":
     #     print(rs['product_attrs'])
     # print(rs)
 
-    with open('../test/data/2.html', 'r', encoding='utf-8') as f: #D:/html
+    with open('D:/html/2.html', 'r', encoding='utf-8') as f:
         text = f.read()
         t1 = time.time()
         print(predict('', text, title))
         t2 = time.time()
-        # print(predict('', text, title))
+        print(predict('', text, title))
         t3 = time.time()
         print('第一次耗时:%.4f, 第二次耗时:%.4f'%(t2-t1, t3-t2))
     # print(predict('',text,title))

+ 255 - 30
BiddingKG/dl/interface/getAttributes.py

@@ -1129,10 +1129,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 if not re.search("电,?话", phone_left):
                     last_phone_mask = False
                     continue
-            if re.search("注册[证号]|帐,?号|编,?[号码]|报,?价|证,?号|价,?格|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", phone_left):
+            if re.search("注册[证号]|帐,?号|编,?[号码]|报,?价|标,?价|证,?号|价,?格|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", phone_left):
                 last_phone_mask = False
                 continue
-            if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+", phone_right):
+            if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
                 last_phone_mask = False
                 continue
             # if:上一个phone实体不符合条件
@@ -2195,22 +2195,247 @@ def turnBidWay(bidway):
     else:
         return "其他"
 
+my_time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
+import time
+def my_timeFormat(_time):
+    current_year = time.strftime("%Y",time.localtime())
+    all_match = re.finditer(my_time_format_pattern,_time)
+    time_list = []
+    for _match in all_match:
+        if len(_match.group())>0:
+            legal = True
+            year = ""
+            month = ""
+            day = ""
+            for k,v in _match.groupdict().items():
+                if k=="year":
+                    year = v
+                if k=="month":
+                    month = v
+                if k=="day":
+                    day = v
+            if year!="":
+                if len(year)==2:
+                    year = "20"+year
+                if int(year)>int(current_year):
+                    legal = False
+            else:
+                legal = False
+            if month!="":
+                if int(month)>12:
+                    legal = False
+            else:
+                legal = False
+            if day!="":
+                if int(day)>31:
+                    legal = False
+            else:
+                legal = False
+            if legal:
+                # return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
+                time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
+    return time_list
+
+def getTimeAttributes(list_entity,list_sentence):
+    time_entitys = [i for i in list_entity if i.entity_type=='time']
+    time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
+    list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
+    dict_time = {
+        "time_release": [],
+        "time_bidopen": [],
+        "time_bidclose": [],
+        'time_bidstart': [],  # 12 bid submission (start) time / response document receipt (start) time
+
+        'time_publicityStart': [],  # 4 publicity start time (publicity time / publicity period)
+        'time_publicityEnd': [],  # 5 publicity deadline
+        'time_getFileStart': [],  # 6 document acquisition start time (document acquisition time)
+        'time_getFileEnd': [],  # 7 document acquisition deadline
+        'time_registrationStart': [],  # 8 registration start time (registration time)
+        'time_registrationEnd': [],  # 9 registration deadline
+        'time_earnestMoneyStart': [],  # 10 earnest money submission start time (earnest money submission time)
+        'time_earnestMoneyEnd': [],  # 11 earnest money submission deadline
+        'time_commencement': [],  # 13 commencement date
+        'time_completion': []  # 14 completion date
+    }
+    last_sentence_index = 0
+    last_time_type = ""
+    last_time_index = {
+        'time_bidstart':"time_bidclose",
+        'time_publicityStart':"time_publicityEnd",
+        'time_getFileStart':"time_getFileEnd",
+        'time_registrationStart':"time_registrationEnd",
+        'time_earnestMoneyStart':"time_earnestMoneyEnd",
+        'time_commencement':"time_completion",
+    }
+    for entity in time_entitys:
+        sentence_text = list_sentence[entity.sentence_index].sentence_text
+        entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
+        entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
+        label_prob = entity.values[entity.label]
+        entity_text = entity.entity_text
+        extract_time = my_timeFormat(entity_text)
+        if extract_time:
+            if re.search("至|到", entity_left):
+                if entity.sentence_index == last_sentence_index:
+                    time_type = last_time_index.get(last_time_type)
+                    if time_type:
+                        dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10))
+                        last_time_type = ""
+                        continue
+            if entity.label!=0:
+                if entity.label==1 and label_prob>0.5:
+                    dict_time['time_release'].append((extract_time[0],label_prob))
+                    last_time_type = 'time_release'
+                elif entity.label==2 and label_prob>0.5:
+                    dict_time['time_bidopen'].append((extract_time[0],label_prob))
+                    last_time_type = 'time_bidopen'
+                elif entity.label==3 and label_prob>0.5:
+                    dict_time['time_bidclose'].append((extract_time[0],label_prob))
+                    last_time_type = 'time_bidclose'
+                elif entity.label==12 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_bidclose'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_bidclose'
+                        else:
+                            dict_time['time_bidstart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_bidstart'
+                    else:
+                        dict_time['time_bidstart'].append((extract_time[0],label_prob))
+                        dict_time['time_bidclose'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==4 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_publicityEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_publicityEnd'
+                        else:
+                            dict_time['time_publicityStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_publicityStart'
+                    else:
+                        dict_time['time_publicityStart'].append((extract_time[0],label_prob))
+                        dict_time['time_publicityEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==5 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_publicityEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_publicityEnd'
+                    else:
+                        dict_time['time_publicityStart'].append((extract_time[0],label_prob))
+                        dict_time['time_publicityEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==6 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_getFileEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_getFileEnd'
+                        else:
+                            dict_time['time_getFileStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_getFileStart'
+                    else:
+                        dict_time['time_getFileStart'].append((extract_time[0],label_prob))
+                        dict_time['time_getFileEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==7 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_getFileEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_getFileEnd'
+                    else:
+                        dict_time['time_getFileStart'].append((extract_time[0],label_prob))
+                        dict_time['time_getFileEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==8 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_registrationEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_registrationEnd'
+                        else:
+                            dict_time['time_registrationStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_registrationStart'
+                    else:
+                        dict_time['time_registrationStart'].append((extract_time[0],label_prob))
+                        dict_time['time_registrationEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==9 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_registrationEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_registrationEnd'
+                    else:
+                        dict_time['time_registrationStart'].append((extract_time[0],label_prob))
+                        dict_time['time_registrationEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==10 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_earnestMoneyEnd'
+                        else:
+                            dict_time['time_earnestMoneyStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_earnestMoneyStart'
+                    else:
+                        dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob))
+                        dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==11 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_earnestMoneyEnd'
+                    else:
+                        dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob))
+                        dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==13 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_completion'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_completion'
+                        else:
+                            dict_time['time_commencement'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_commencement'
+                    else:
+                        dict_time['time_commencement'].append((extract_time[0],label_prob))
+                        dict_time['time_completion'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==14 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_completion'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_completion'
+                    else:
+                        dict_time['time_commencement'].append((extract_time[0],label_prob))
+                        dict_time['time_completion'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                else:
+                    last_time_type = ""
+            else:
+                last_time_type = ""
+        else:
+            last_time_type = ""
+        last_sentence_index = entity.sentence_index
+
+
+    result_dict = dict((key,"") for key in dict_time.keys())
+    for time_type,value in dict_time.items():
+        list_time = dict_time[time_type]
+        if list_time:
+            list_time.sort(key=lambda x:x[1],reverse=True)
+            result_dict[time_type] = list_time[0][0]
+    return result_dict
+
 def getOtherAttributes(list_entity):
     dict_other = {"moneysource":"",
                   "person_review":[],
-                  "time_release":"",
-                  "time_bidopen":"",
-                  "time_bidclose":"",
+                  # "time_release":"",
+                  # "time_bidopen":"",
+                  # "time_bidclose":"",
                   "serviceTime":"",
                   "product":[],
                   "total_tendereeMoney":0,
-                  "total_tendereeMoneyUnit":''
-                   }
-    dict_time = {
-        "time_release": [],
-        "time_bidopen": [],
-        "time_bidclose": []
-    }
+                  "total_tendereeMoneyUnit":''}
+    # dict_time = {
+    #     "time_release": [],
+    #     "time_bidopen": [],
+    #     "time_bidclose": []
+    # }
     for entity in list_entity:
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
@@ -2218,18 +2443,18 @@ def getOtherAttributes(list_entity):
             dict_other["moneysource"] = entity.entity_text
         elif entity.entity_type=='serviceTime':
             dict_other["serviceTime"] = entity.entity_text
-        elif entity.entity_type == 'time' and entity.label==1:
-            if entity.values[entity.label]>0.6:
-                dict_time['time_release'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
-            # dict_other["time_release"] = timeFormat(entity.entity_text)
-        elif entity.entity_type == 'time' and entity.label==2:
-            if entity.values[entity.label]>0.6:
-                dict_time['time_bidopen'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
-            # dict_other["time_bidopen"] = timeFormat(entity.entity_text)
-        elif entity.entity_type == 'time' and entity.label == 3:
-            if entity.values[entity.label]>0.6:
-                dict_time['time_bidclose'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
-            # dict_other["time_bidclose"] = timeFormat(entity.entity_text)
+        # elif entity.entity_type == 'time' and entity.label==1:
+        #     if entity.values[entity.label]>0.6:
+        #         dict_time['time_release'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+        #     # dict_other["time_release"] = timeFormat(entity.entity_text)
+        # elif entity.entity_type == 'time' and entity.label==2:
+        #     if entity.values[entity.label]>0.6:
+        #         dict_time['time_bidopen'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+        #     # dict_other["time_bidopen"] = timeFormat(entity.entity_text)
+        # elif entity.entity_type == 'time' and entity.label == 3:
+        #     if entity.values[entity.label]>0.6:
+        #         dict_time['time_bidclose'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+        #     # dict_other["time_bidclose"] = timeFormat(entity.entity_text)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':
@@ -2238,11 +2463,11 @@ def getOtherAttributes(list_entity):
                 dict_other["total_tendereeMoney"] = float(entity.entity_text)
                 dict_other["total_tendereeMoneyUnit"] = entity.money_unit
     # 时间类别
-    for time_type,value in dict_time.items():
-        list_time = dict_time[time_type]
-        if list_time:
-            list_time.sort(key=lambda x:x[1],reverse=True)
-            dict_other[time_type] = list_time[0][0]
+    # for time_type,value in dict_time.items():
+    #     list_time = dict_time[time_type]
+    #     if list_time:
+    #         list_time.sort(key=lambda x:x[1],reverse=True)
+    #         dict_other[time_type] = list_time[0][0]
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
 
@@ -2259,7 +2484,7 @@ def getPREMs(list_sentences,list_entitys,list_articles):
     result = []
     for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
         RoleList = getPackageRoleMoney(list_sentence,list_entity)
-        result.append(dict({"prem":RoleList,"docid":list_article.id},**getOtherAttributes(list_entity),
+        result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence),
                            **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
                               "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
                               "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))

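The new my_timeFormat helper normalizes every date found in an entity to YYYY-MM-DD and drops illegal ones (month > 12, day > 31, or a year later than the current year). A quick check (a sketch; assumes the module is importable and the current year is 2022 or later):

from BiddingKG.dl.interface.getAttributes import my_timeFormat

print(my_timeFormat("公示期:2021/12/21至2022年1月5日"))
# -> ['2021-12-21', '2022-01-05']
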
+ 53 - 11
BiddingKG/dl/interface/predictor.py

@@ -20,6 +20,7 @@ import tensorflow as tf
 from BiddingKG.dl.product.data_util import decode, process_data
 from BiddingKG.dl.interface.Entitys import Entity
 from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
+from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
 from bs4 import BeautifulSoup
 import copy
 import calendar
@@ -37,41 +38,47 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   "product":{"predictor":None,"Lock":RLock()},
                 "product_attrs":{"predictor":None,"Lock":RLock()},
                   "channel": {"predictor": None, "Lock": RLock()},
-                  "deposit_payment_way": {"predictor": None, "Lock": RLock()}}
+                  "deposit_payment_way": {"predictor": None, "Lock": RLock()},
+                  "total_unit_money": {"predictor": None, "Lock": RLock()}
+                  }
 
 
 def getPredictor(_type):
     if _type in dict_predictor:
         with dict_predictor[_type]["Lock"]:
             if dict_predictor[_type]["predictor"] is None:
-                if _type=="codeName":
+                if _type == "codeName":
                     dict_predictor[_type]["predictor"] = CodeNamePredict()
-                if _type=="prem":
+                if _type == "prem":
                     dict_predictor[_type]["predictor"] = PREMPredict()
-                if _type=="epc":
+                if _type == "epc":
                     dict_predictor[_type]["predictor"] = EPCPredict()
-                if _type=="roleRule":
+                if _type == "roleRule":
                     dict_predictor[_type]["predictor"] = RoleRulePredictor()
                 if _type == "roleRuleFinal":
                     dict_predictor[_type]["predictor"] = RoleRuleFinalAdd()
-                if _type=="form":
+                if _type == "form":
                     dict_predictor[_type]["predictor"] = FormPredictor()
-                if _type=="time":
+                if _type == "time":
                     dict_predictor[_type]["predictor"] = TimePredictor()
-                if _type=="punish":
+                if _type == "punish":
                     dict_predictor[_type]["predictor"] = Punish_Extract()
-                if _type=="product":
+                if _type == "product":
                     dict_predictor[_type]["predictor"] = ProductPredictor()
-                if _type=="product_attrs":
+                if _type == "product_attrs":
                     dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
                 if _type == "channel":
                     dict_predictor[_type]["predictor"] = DocChannel()
                 if _type == 'deposit_payment_way':
                     dict_predictor[_type]["predictor"] = DepositPaymentWay()
+                if _type == 'total_unit_money':
+                    dict_predictor[_type]["predictor"] = TotalUnitMoney()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
-#编号名称模型
+
+# project code & name model
 class CodeNamePredict():
     
     def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
@@ -2284,6 +2291,39 @@ class DepositPaymentWay():
         else:
             return pay_way
 
+
+# total price / unit price extraction
+class TotalUnitMoney:
+    def __init__(self):
+        pass
+
+    def predict(self, list_sentences, list_entitys):
+        for i in range(len(list_entitys)):
+            list_entity = list_entitys[i]
+
+            # total price / unit price
+            for _entity in list_entity:
+                if _entity.entity_type == 'money':
+                    word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text
+                    # total price: look among winning-bid amounts (label == 1)
+                    if _entity.label == 1:
+                        result = extract_total_money(word_of_sentence,
+                                                     _entity.entity_text,
+                                                     [_entity.wordOffset_begin, _entity.wordOffset_end])
+                        if result:
+                            _entity.is_total_money = 1
+
+                    # unit price: look among ordinary amounts
+                    else:
+                        result = extract_unit_money(word_of_sentence,
+                                                    _entity.entity_text,
+                                                    [_entity.wordOffset_begin, _entity.wordOffset_end])
+                        if result:
+                            _entity.is_unit_money = 1
+                # print("total_unit_money", _entity.entity_text,
+                #       _entity.is_total_money, _entity.is_unit_money)
+
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()
@@ -2401,6 +2441,7 @@ def h5_to_graph(sess,graph,h5file):
         print(name,graph.get_tensor_by_name(name),np.shape(value))
         sess.run(tf.assign(graph.get_tensor_by_name(name),value))
 
+
 def initialize_uninitialized(sess):
     global_vars          = tf.global_variables()
     is_not_initialized   = sess.run([tf.is_variable_initialized(var) for var in global_vars])
@@ -2482,7 +2523,8 @@ def save_role_model():
                                            "input2":model.input[2]},
                                    outputs={"outputs":model.output}
                                    )
-    
+
+
 def save_money_model():
     model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
     graph = tf.Graph()

BIN
BiddingKG/dl/interface/timesplit_model/saved_model.pb


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.index


+ 145 - 0
BiddingKG/dl/money/re_money_total_unit.py

@@ -0,0 +1,145 @@
+import json
+import pandas as pd
+import re
+from bs4 import BeautifulSoup
+
+# total price
+total_money = '(合计.?金额|合.?计|总.?价)'
+# unit price
+unit_money = '(单价|([0-9.,,]+([((]?元[))]?)?/))'
+
+
+def re_standard_total(_str):
+    reg_standard = "(?P<value>" + total_money + ")"
+    match = re.finditer(reg_standard, _str)
+    total_money_list = []
+    if match:
+        for m in match:
+            m_dict = m.groupdict()
+            m_span = m.span()
+            keyword_index = [m_span[0], m_span[1]]
+            keyword = m_dict.get("value")
+            # total_money_list.append([keyword, keyword_index])
+            total_money_list.append([keyword, keyword_index, _str])
+
+    return total_money_list
+
+
+def re_standard_unit(_str):
+    reg_standard = "(?P<value>" + unit_money + ")"
+    match = re.finditer(reg_standard, _str)
+    unit_money_list = []
+    if match:
+        for m in match:
+            m_dict = m.groupdict()
+            m_span = m.span()
+            keyword_index = [m_span[0], m_span[1]]
+            keyword = m_dict.get("value")
+            # unit_money_list.append([keyword, keyword_index])
+
+            # contexts mentioning 文件 (tender documents) don't count
+            if '文件' not in _str:
+                unit_money_list.append([keyword, keyword_index, _str])
+
+    return unit_money_list
+
+
+def re_total(text, money, index):
+    # run the regex over the text just before the already-extracted winning-bid amount
+    prefix_threshold = 7
+    suffix_threshold = 0
+    # if index_threshold < index[0]:
+    #     money_text = text[index[0]-index_threshold:index[0]]
+    #     print("total", money, text[index[0]-index_threshold:index[1]], money_text)
+    # else:
+    #     money_text = text[:index[0]]
+    #     print("total", money, text[:index[1]], money_text)
+
+    prefix_index = index[0] - prefix_threshold
+    suffix_index = index[1] + suffix_threshold
+    money_text = text[prefix_index if prefix_index > 0 else 0:
+                      suffix_index if suffix_index < len(text) else len(text)]
+
+    # look for a standard-form total-price keyword
+    total_money_list = re_standard_total(money_text)
+    return total_money_list
+
+
+def re_unit(text, money, index):
+    # split on commas
+    # texts = text.split(",")
+    # for t in texts:
+    #     match = re.search(money, t)
+    #     if match:
+    #         text = t
+    #         index = match.span()
+    #         break
+    #     else:
+    #         text = ""
+    #         index = (0, 0)
+
+    # run the regex over the text just before the already-extracted winning-bid amount
+    prefix_threshold = 7
+    suffix_threshold = 3
+    # if prefix_threshold < index[0]:
+    #     money_text = text[index[0]-prefix_threshold:index[0]]
+    #     print("unit", money, text[index[0]-prefix_threshold:index[1]], money_text)
+    # else:
+    #     money_text = text[:index[0]]
+    #     print("unit", money, text[:index[1]], money_text)
+
+    prefix_index = index[0] - prefix_threshold
+    suffix_index = index[1] + suffix_threshold
+    money_text = text[prefix_index if prefix_index > 0 else 0:
+                      suffix_index if suffix_index < len(text) else len(text)]
+
+    # look for a standard-form unit-price keyword
+    unit_money_list = re_standard_unit(money_text)
+    return unit_money_list
+
+
+def extract_total_money(text, money, index):
+    result_list = []
+    total_money_list = re_total(text, money, index)
+    if total_money_list:
+        for word, text_index, context in total_money_list:
+            d = {"body": word, "begin_index": text_index[0],
+                 "end_index": text_index[1], "context": context}
+            result_list.append(d)
+    return result_list
+
+
+def extract_unit_money(text, money, index):
+    result_list = []
+    unit_money_list = re_unit(text, money, index)
+    if unit_money_list:
+        for word, text_index, context in unit_money_list:
+            d = {"body": word, "begin_index": text_index[0],
+                 "end_index": text_index[1], "context": context}
+            result_list.append(d)
+    return result_list
+
+
+def test_str():
+    s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
+    s = '往往,20(元)/平方'
+    print(extract_unit_money(s, "785.0", [6, 11]))
+
+
+def test_html():
+    html_path = "C:/Users/Administrator/Desktop/3.html"
+
+    with open(html_path, "r") as f:
+        s = f.read()
+
+    print(extract_total_money(s, "", [0, len(s)]))  # dummy money/index arguments so the whole text is scanned
+
+
+if __name__ == "__main__":
+    # extract_bidway(s)
+
+    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
+    test_str()
+    # test_html(path)
+    pass
+

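Worked calls against the two extractors (a sketch; note the money argument is currently unused by re_total/re_unit, and begin_index/end_index are relative to the clipped context window, not the full text):

from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money

print(extract_total_money("合计金额:785.0元", "785.0", [5, 10]))
# -> [{'body': '合计金额', 'begin_index': 0, 'end_index': 4, 'context': '合计金额:785.0'}]

print(extract_unit_money("往往,20(元)/平方", "20", [3, 8]))
# -> [{'body': '20(元)/', 'begin_index': 3, 'end_index': 9, 'context': '往往,20(元)/平方'}]
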
+ 75 - 0
BiddingKG/dl/money/test_re_money_total_unit.py

@@ -0,0 +1,75 @@
+import json
+import re
+import sys, os
+import time
+
+import pandas as pd
+from bs4 import BeautifulSoup
+sys.path.append(os.path.abspath("../.."))
+from BiddingKG.dl.interface.extract import predict
+
+
+def bidi_predict(html_str):
+    content = html_str
+    # content = "<div>总价:1110</div>"
+    result_dict = json.loads(predict("1", content))
+    return result_dict
+
+
+def test_csv(_path):
+    start_time = time.time()
+    df = pd.read_csv(_path)
+
+    # total money
+    predict_list_1 = []
+    predict_list_2 = []
+    for index, row in df.iterrows():
+        # if index >= 1000:
+        #     break
+
+        if index % 50 == 0:
+            print("="*30, "Loop", index, "="*30)
+
+        html_str = row["dochtmlcon"]
+        # html_str = df.loc[75, "dochtmlcon"]
+        # print(html_str)
+
+        # pre-filter first
+        # possible = '((合计.?金额|合.?计|总.?价|单.?价)((元))?([:: ]))' \
+        #            '|([0-9.,,]+([((]?元[))]?)?/)'
+        # if not re.search(possible, html_str):
+        #     predict_list_1.append(str([]))
+        #     predict_list_2.append(str([]))
+        #     continue
+
+        # run the model pipeline first
+        result_dict = bidi_predict(html_str)
+
+        # get the total/unit price results
+        word_list_1 = result_dict.get("total_money")
+        word_list_2 = result_dict.get("unit_money")
+
+        if word_list_1:
+            predict = word_list_1
+        else:
+            predict = []
+        print("predict total money", predict)
+        predict_list_1.append(str(predict))
+
+        if word_list_2:
+            predict = word_list_2
+        else:
+            predict = []
+        print("predict unit money", predict)
+        predict_list_2.append(str(predict))
+
+    predict_df_1 = pd.DataFrame(predict_list_1)
+    predict_df_2 = pd.DataFrame(predict_list_2)
+    df = pd.concat([df, predict_df_1, predict_df_2], axis=1)
+    df.to_csv(_path)
+    print("finish write!", time.time()-start_time)
+
+
+if __name__ == "__main__":
+    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
+    test_csv(path)

+ 28 - 0
BiddingKG/dl/offer_type/re_offer_type.py

@@ -0,0 +1,28 @@
+import pandas as pd
+import re
+
+# 报价类型为总价报价
+# 报价类型: 闭口价
+# 报价类型:国内含税价/人民币
+# 报价类型:国内含税价;人民币
+# 报价类型: 浮动价
+# 报价类型 含税含运费
+# 报价类型 单个商品报价
+# 报价类型:单个标的报单价
+# 报价类型:多个标的报总价,
+# 报价类型:不含税(到厂)
+# 报价类型: 金额
+# 报价类型 含税含运费
+# 报价类型:单个标的报单价
+
+
+
+
+
+
+
+
+
+# 报价类型:
+
+

+ 163 - 4
BiddingKG/dl/product/data_util.py

@@ -4,16 +4,130 @@
 # @Time    : 2021/1/13 0013 14:19
 import re
 import math
+import json
 import random
 import numpy as np
+import pandas as pd
 from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word,viterbi_decode
 
-id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
+tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
+id_to_tag = {v:k for k,v in tag2index.items()}
+# id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
 word_model = getModel_word()
+
 vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
 word2id = {k: v for v, k in enumerate(vocab)}
 max_id = len(vocab)
 
+# vocab = ["<pad>"] + word_model.index2word+ ["<unk>"]
+# matrix = np.zeros((len(vocab), 60))
+# for i in range(1, len(vocab)-1):
+#     matrix[i] = word_model[vocab[i]]
+# max_id = len(vocab)
+# word2id = {k: v for v, k in enumerate(vocab)}
+
+def df2data(df):
+    datas = []
+    for idx in df.index:
+        docid = df.loc[idx, 'docid']
+        text = df.loc[idx, 'text']
+        # string = list(text)
+        tags = [0]*len(text)
+        labels = json.loads(df.loc[idx, 'label'])
+        for label in labels:
+            _, _, begin, end, _ = re.split(r'\s', label)
+            begin = int(begin)
+            end = int(end)
+            if end-begin>=2:
+                tags[begin]=1
+                tags[end-1]=3
+                for i in range(begin+1,end-1):
+                    tags[i]=2
+        # datas.append([string, tags])
+        text_sentence = []
+        ids_sentence = []
+        tag_sentence = []
+        for i in range(len(text)):
+            text_sentence.append(text[i])
+            ids_sentence.append(word2id.get(text[i], max_id))
+            tag_sentence.append(tags[i])
+            if text[i] in ['。','!']:
+                if text_sentence:
+                    # if len(text_sentence) > 100:
+                    if len(text_sentence)>5 and len(text_sentence)<1000:
+                        datas.append([text_sentence, ids_sentence,tag_sentence])
+                    else:
+                        print('Sentence length under 5 or over 1000; length: %d, docid: %s' % (len(text_sentence), docid))
+                    text_sentence = []
+                    ids_sentence = []
+                    tag_sentence = []
+        if text_sentence:
+            # if len(text_sentence) > 5:
+            if len(text_sentence) > 5 and len(text_sentence) < 1000:
+                datas.append([text_sentence, ids_sentence, tag_sentence])
+            else:
+                print('Sentence length under 5 or over 1000; length: %d, docid: %s' % (len(text_sentence), docid))
+    return datas
+
+def find_kw_from_text(kw, s):
+    '''
+    Given a keyword and a sentence, return every occurrence span of the
+    keyword within the sentence
+    :param kw: keyword
+    :param s: text
+    :return: list of (begin, end) tuples
+    '''
+    begin = s.find(kw, 0)
+    kws = []
+    while begin!=-1:
+        end = begin + len(kw)
+        # print(s[begin:end])
+        kws.append((begin, end))
+        begin = s.find(kw, end)
+    return kws
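+
+# e.g. find_kw_from_text('打印机', '采购打印机,打印机两台') -> [(2, 5), (6, 9)]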
+
+def get_feature(text, lbs):
+    '''
+    Given an article's preprocessed text and a list of product names, return
+    the sentence list, the id-encoded sentence list and the tag list
+    :param text: text content
+    :param lbs: product name list
+    :return:
+    '''
+    lbs = sorted(set(lbs), key=lambda x: len(x), reverse=True)
+    sentences = []
+    ids_list = []
+    tags_list = []
+    for sentence in text.split('。'):
+        if len(sentence) < 5:
+            continue
+        if len(sentence) > 1000:
+            sentence = sentence[:1000]
+        tags = [0] * len(sentence)
+        ids = [word2id.get(word, max_id) for word in sentence]
+        for lb in lbs:
+            kw_indexs = find_kw_from_text(lb, sentence)
+            for indexs in kw_indexs:
+                b, e = indexs
+                if tags[b] == 0 and tags[e - 1] == 0:
+                    tags[b] = 1
+                    tags[e - 1] = 3
+                    for i in range(b+1, e - 1):
+                        tags[i] = 2
+        sentences.append(list(sentence))
+        ids_list.append(ids)
+        tags_list.append(tags)
+    return sentences, ids_list, tags_list
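+
+# e.g. get_feature('采购打印机。', ['打印机']) returns tags_list [[0, 0, 1, 2, 3]] (B=1, I=2, E=3)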
+
+def dfsearchlb(df):
+    datas = []
+    for i in df.index:
+        text = df.loc[i, 'text']
+        lbs = json.loads(df.loc[i, 'lbset'])
+        sentences, ids_list, tags_list = get_feature(text, lbs)
+        for sen, ids, tags in zip(sentences, ids_list, tags_list):
+            datas.append([sen, ids, tags])
+    return datas
 
 def get_label_data():
     import psycopg2
@@ -112,12 +226,16 @@ def result_to_json(line, tags):
     result = []
     ner = []
     tags = ''.join([str(it) for it in tags])
-    for it in re.finditer("12*3", tags):
+    # for it in re.finditer("12*3", tags):
+    #     start = it.start()
+    #     end = it.end()
+    #     ner.append([line[start:end], (start, end)])
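+    # "45*6" matches the B-rea/I-rea/E-rea tag ids from tag2index, i.e. this
+    # now extracts reason entities, e.g.
+    # result_to_json('不锈钢管', [4, 5, 5, 6]) -> [['不锈钢管', [['不锈钢管', (0, 4)]]]]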
+    for it in re.finditer("45*6", tags):
         start = it.start()
         end = it.end()
         ner.append([line[start:end], (start, end)])
     result.append([line, ner])
-    print(tags)
+    # print(tags)
     return result
 
 
@@ -153,4 +271,45 @@ class BatchManager(object):
         if shuffle:
             random.shuffle(self.batch_data)
         for idx in range(self.len_data):
-            yield self.batch_data[idx]
+            yield self.batch_data[idx]
+
+def 获取原始标注数据():  # fetch the raw annotation data from the iepy database
+    import psycopg2
+    import json
+    conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.103')
+    cursor = conn.cursor()
+    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 ;"
+    cursor.execute(sql)
+    datas = []
+    for row in cursor.fetchall():
+        docid = row[0]
+        text = row[1]
+        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
+        cursor.execute(sql_lb)
+        rows = cursor.fetchall()
+        print('len(rows)', len(rows))
+        datas.append((docid, text, json.dumps(rows, ensure_ascii=False), len(rows)))
+    df = pd.DataFrame(datas, columns=['docid', 'text', 'rows', 'product_num'])
+    df.to_excel('data/产品数据自己人标注的原始数据.xlsx')
+
+
+if __name__=="__main__":
+    # import os
+    import pickle
+    # with open('data/dev_data2.pkl', 'rb') as f:
+    #     dev_data = pickle.load(f)
+    # print(len(dev_data))
+    # print(os.path.exists('data/testdata.xlsx'))
+    # df = pd.read_excel('data/testdata.xlsx')
+    # print(len(df))
+    # data_test = df2data(df)
+    # print(len(data_test), len(data_test[0][0]))
+    # 获取原始标注数据()
+    df = pd.read_excel('data/产品数据自己人标注的原始数据.xlsx')
+    with open('data/dev_data2.pkl', 'rb') as f:
+        dev_data = pickle.load(f)
+    print(len(set(df['docid'])))
+    print('')
+
+

+ 138 - 35
BiddingKG/dl/product/main.py

@@ -3,38 +3,54 @@
 # @Author  : bidikeji
 # @Time    : 2021/1/13 0013 14:03 
 from BiddingKG.dl.product.product_model import Product_Model
-from BiddingKG.dl.product.data_util import BatchManager, get_label_data, id_to_tag, input_from_line, decode, result_to_json
+from BiddingKG.dl.product.data_util import BatchManager, get_label_data, id_to_tag, input_from_line, decode, result_to_json, df2data,dfsearchlb
+from BiddingKG.dl.product.data_process import data_precess
 import numpy as np
+import pandas as pd
 import tensorflow as tf
 import random
 import pickle
 import os
+import glob
+os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
 
 def train():
     # all_data = get_label_data()
     # random.shuffle(all_data)
     # train_data = all_data[:int(len(all_data)*0.85)]
     # dev_data = all_data[int(len(all_data)*0.85):]
-    # with open('data/train_data2.pkl', 'wb') as f:
-    #     pickle.dump(train_data, f)
-    # with open('data/dev_data2.pkl', 'wb') as f:
-    #     pickle.dump(dev_data, f)
 
-    with open('data/train_data2.pkl', 'rb') as f:
-        train_data = pickle.load(f)
-    with open('data/dev_data2.pkl', 'rb') as f:
-        dev_data = pickle.load(f)
+    # df = pd.read_excel('data/所有产品标注数据筛选20211125.xlsx')
+    # df.reset_index(drop=True, inplace=True)
+    # np.random.seed(8)
+    # shuffle_ids = np.random.permutation(len(df))
+    # split_ids = int(len(df)*0.1)
+    # train_ids = shuffle_ids[split_ids:]
+    # dev_ids = shuffle_ids[:int(split_ids/2)]
+    # df_train = df.iloc[train_ids]
+    # df_dev = df.iloc[dev_ids]
+    # train_data = df2data(df_train)
+    # dev_data = df2data(df_dev)
 
-    train_manager = BatchManager(train_data, batch_size=128)
-    dev_manager = BatchManager(dev_data, batch_size=64)
+    # with open(os.path.dirname(__file__)+'/data/train_data2021-11-30.pkl', 'rb') as f:
+    #     train_data = pickle.load(f)
+    # with open(os.path.dirname(__file__)+'data/dev_data2021-11-30.pkl', 'rb') as f:
+    #     dev_data = pickle.load(f)
 
-    tf_config = tf.ConfigProto()
-    tf_config.gpu_options.allow_growth = True
+    train_data, dev_data = data_precess()
+
+    train_manager = BatchManager(train_data, batch_size=256)
+    dev_manager = BatchManager(dev_data, batch_size=256)
+
+    # tf_config = tf.ConfigProto()
+    # tf_config.gpu_options.allow_growth = True
+    tf_config = tf.ConfigProto(device_count={'GPU': 1})  # device_count keys are upper-case ('CPU'/'GPU')
     steps_per_epoch = train_manager.len_data
-    ckpt_path = "model"
+    ckpt_path = os.path.dirname(__file__)+'/'+"model"
     with tf.Session(config=tf_config) as sess:
         model = Product_Model()
         sess.run(tf.global_variables_initializer())
+        model.saver.restore(sess, os.path.join(ckpt_path, "ner2.ckpt"))
         # ckpt = tf.train.get_checkpoint_state(ckpt_path)
         # if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
         #     model.saver.restore(sess, ckpt.model_checkpoint_path)
@@ -44,7 +60,7 @@ def train():
         loss = []
         mix_loss = 1000
         max_f1 = 0
-        for i in range(100):
+        for i in range(20):
             print('epochs:',i)
             # model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
             # break
@@ -53,20 +69,21 @@ def train():
                 # step, batch_loss = model.run_step(sess, True, batch)
                 step, batch_loss = model.run_step(sess, 'train', batch)
                 loss.append(batch_loss)
-                if step % 10 == 0:
+                if step % 1000 == 0:
                     iteration = step // steps_per_epoch + 1
                     print('iter:{} step:{} loss:{}'.format(iteration, step, np.mean(loss)))
-            if i >= 50 or i%5==0:
+            if i >= 2 or i%5==0:
                 f1, precision, recall, evl_loss = model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
                 print('f1:%.4f, precision:%.4f, recall:%.4f, evl_loss:%.4f' % (f1, precision, recall, evl_loss))
-                if max_f1 < f1:
-                    model.saver.save(sess, os.path.join(ckpt_path, "ner2.ckpt"))
-                    print("model save .bast f1 is %.4f" % f1)
+                # if max_f1 < f1:
+                #     model.saver.save(sess, os.path.join(ckpt_path, "ner2.ckpt"))
+                #     print("model save .bast f1 is %.4f" % f1)
+                #     max_f1 = f1
+                if evl_loss<mix_loss and max_f1 < f1:
+                    mix_loss = evl_loss
                     max_f1 = f1
-                    # if np.mean(loss)<mix_loss:
-                    #     mix_loss = np.mean(loss)
-                    #     model.saver.save(sess, os.path.join(ckpt_path, "ner.ckpt"))
-                    #     print("model saved, loss is:",mix_loss)
+                    model.saver.save(sess, os.path.join(ckpt_path, "ner1202_find_lb.ckpt")) #ner1130_find_lb.ckpt
+                    print("model saved, val_loss is:",mix_loss)
                 loss = []
 
 def evaluate_line():
@@ -74,15 +91,22 @@ def evaluate_line():
     with tf.Session() as sess:
         model = Product_Model()
         sess.run(tf.global_variables_initializer())
-        ckpt = tf.train.get_checkpoint_state(ckpt_path)
-        if ckpt and tf.train.checkpoint_exists(ckpt_path):
-            print('模型文件:',ckpt.model_checkpoint_path)
-            model.saver.restore(sess, ckpt.model_checkpoint_path)
-            print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
-            while True:
-                line = input("请输入测试句子:")
-                result = model.evaluate_line(sess, line)
-                print(result)
+        # model.saver.restore(sess, 'model/ner1215.ckpt')
+        # model.saver.restore(sess, 'model/ner_f10.7039_loss1.2353.ckpt')
+        model.saver.restore(sess, 'model/ner_epoch10_f10.6875_loss1.5230.ckpt')
+        while True:
+            line = input("Enter a test sentence: ")
+            result = model.evaluate_line(sess, line)
+            print(result)
+        # ckpt = tf.train.get_checkpoint_state(ckpt_path)
+        # if ckpt and tf.train.checkpoint_exists(ckpt_path):
+        #     print('模型文件:',ckpt.model_checkpoint_path)
+        #     model.saver.restore(sess, ckpt.model_checkpoint_path)
+        #     print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
+        #     while True:
+        #         line = input("请输入测试句子:")
+        #         result = model.evaluate_line(sess, line)
+        #         print(result)
 def predict():
     pb_path = "model/product.pb"
     with tf.Graph().as_default():
@@ -111,7 +135,86 @@ def predict():
                     result = result_to_json(line, tags)
                     print(result)
 
+def predict_df():
+    ckpt_path = "model"
+    import json
+    with tf.Session() as sess:
+        model = Product_Model()
+        sess.run(tf.global_variables_initializer())
+        ckpt = tf.train.get_checkpoint_state(ckpt_path)
+        # model.saver.restore(sess, 'model/ner2.ckpt')
+        # model.saver.restore(sess, 'model/ner1201_find_lb.ckpt')  # f1:0.6972, precision:0.7403, recall:0.6588, evl_loss:1.2983  model saved, val_loss is: 1.32706
+        # model.saver.restore(sess, 'model/ner1208_find_lb.ckpt')  # f1:0.7038, precision:0.7634, recall:0.6528, evl_loss:1.3046 model saved, val_loss is: 1.29316
+        # model.saver.restore(sess, 'model/ner_f10.7039_loss1.2353.ckpt')  # f1:0.70 ner1215
+        model.saver.restore(sess, 'model/ner_epoch4_f10.6952_loss1.2512.ckpt')  # f1:0.70 ner1215
+
+        print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
+        # df = pd.read_excel('../test/data/贵州数据新字段提取信息_predict.xlsx')
+        # df = pd.read_excel('../test/data/所有产品标注数据_补充筛选废标原因数据.xlsx')
+        # df = pd.read_excel('../test/data/所有产品标注数据筛选_废标_predict.xlsx')
+        df = pd.read_excel('../test/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+        # df = pd.read_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+        df.reset_index(drop=True, inplace=True)
+        rs = []
+        for i in df.index:
+            line = df.loc[i, 'text']
+            pos = df.loc[i, 'pos']
+            reason = df.loc[i, 'reasons_label']
+            if pos==0 or reason!='[]':
+                rs.append('')
+                continue
+            # if i > 200:
+            #     rs.append('')
+            #     continue
+            # line = df.loc[i, 'process_text']
+            result = model.evaluate_line(sess, line)
+            print(result[0][1])
+            rs.append(json.dumps(result[0][1], ensure_ascii=False))
+        # df['pred_new1202'] = pd.Series(rs)
+        df['reson_model'] = pd.Series(rs)
+        # df.to_excel('../test/data/贵州数据新字段提取信息_predict.xlsx')
+        # df.to_excel('../test/data/所有产品标注数据_补充筛选废标原因数据_predict.xlsx')
+        # df.to_excel('../test/data/所有产品标注数据筛选_废标_predict.xlsx')
+        df.to_excel('../test/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+        # df.to_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+
 if __name__ == "__main__":
     # train()
-    # evaluate_line()
-    predict()
+    evaluate_line()
+    # predict()
+    # predict_df()
+    # import json
+    # df = pd.read_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+    # old_new = []
+    # new_old = []
+    # df['old-new'] = df.apply(lambda x:set([str(it) for it in json.loads(x['pred_old'])])-set([str(it) for it in json.loads(x['pred_new'])]), axis=1)
+    # df['new-old'] = df.apply(lambda x:set([str(it) for it in json.loads(x['pred_new'])])-set([str(it) for it in json.loads(x['pred_old'])]), axis=1)
+    # df['old=new'] = df.apply(lambda x: 1 if x['old-new']==x['new-old'] else 0, axis=1)
+    # df.to_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+
+
+    # with open('data/dev_data2.pkl', 'rb') as f:
+    #     dev_data = pickle.load(f)
+    # import json
+    # df_dev = pd.read_excel('data/产品数据自己人标注的原始数据.xlsx')[:]
+    # def rows2lb(rows):
+    #     rows = json.loads(rows)
+    #     rows = list(set([it[0].split()[-1] for it in rows]))
+    #     return json.dumps(rows, ensure_ascii=False)
+    # df_dev['lbset'] = df_dev['rows'].apply(lambda x:rows2lb(x))
+    # dev_data = dfsearchlb(df_dev)
+    # dev_manager = BatchManager(dev_data, batch_size=64)
+    # # ckpt_path = "model/ner0305.ckpt" #f1:0.7304, precision:0.8092, recall:0.6656, evl_loss:2.2160
+    # # ckpt_path = "model/ner0316.ckpt" #f1:0.7220, precision:0.7854, recall:0.6681, evl_loss:2.2921
+    # # ckpt_path = "model/ner2.ckpt" # f1:0.8019, precision:0.8541, recall:0.7557, evl_loss:1.6286
+    # # ckpt_path = "model/ner1029.ckpt" #f1:0.6374, precision:0.6897, recall:0.5924, evl_loss:2.0840
+    # # ckpt_path = "model/ner1129.ckpt" #f1:0.6034, precision:0.6931, recall:0.5343, evl_loss:1.9704
+    # ckpt_path = "model/ner1129.ckpt" #f1:0.6034, precision:0.6931, recall:0.5343, evl_loss:1.9704
+    # with tf.Session() as sess:
+    #     model = Product_Model()
+    #     sess.run(tf.global_variables_initializer())
+    #     model.saver.restore(sess, ckpt_path)
+    #     print("从文件加载原来模型数据",ckpt_path)
+    #     f1, precision, recall, evl_loss = model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
+    #     print('f1:%.4f, precision:%.4f, recall:%.4f, evl_loss:%.4f' % (f1, precision, recall, evl_loss))
+

+ 17 - 13
BiddingKG/dl/product/product_model.py

@@ -6,16 +6,17 @@
 from BiddingKG.dl.product.data_util import matrix,vocab,input_from_line,result_to_json,get_ner
 import tensorflow as tf
 import numpy as np
-# from tensorflow.contrib.crf import crf_log_likelihood
-# from tensorflow.contrib.crf import viterbi_decode
-# from tensorflow.contrib.layers.python.layers import initializers
+from tensorflow.contrib.crf import crf_log_likelihood
+from tensorflow.contrib.crf import viterbi_decode
+from tensorflow.contrib.layers.python.layers import initializers
 
 # word_model = getModel_word()
 class Product_Model(object):
     def __init__(self):
         self.char_dim = 60
-        self.lstm_dim = 128
-        self.num_tags = 4
+        self.lstm_dim = 120  # previously 128
+        # self.num_tags = 4
+        self.num_tags = 7
         self.lr = 0.001
         self.clip = 5.0
         self.dropout_rate = 0.5
@@ -34,6 +35,7 @@ class Product_Model(object):
         self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None],name='CharInputs')
         self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None],name='Targets')
         self.dropout = tf.placeholder(dtype=tf.float32, name='Dropout')
+        # self.lengths = tf.placeholder(dtype=tf.int32, shape=[None],name='lengths')
 
         used = tf.sign(tf.abs(self.char_inputs))
         length = tf.reduce_sum(used, reduction_indices=1)
@@ -207,14 +209,16 @@ class Product_Model(object):
                 # Recall.append(recall_temp)
                 # F1.append(f1_temp)
 
-                # for char, gold, pred in zip(string, gold, pred):
-                #     result.append(" ".join([char, gold, pred]))
-                # results.append(result)
-                # with open('evaluate_result.txt','w', encoding='utf-8') as f:
-                #     for rs in results:
-                #         for line in rs:
-                #             f.write(line+'\n')
-                #         f.write('\n')
+                if gold_ner!=pred_ner:
+                    # use fresh names so gold/pred are not rebound mid-loop
+                    for char, g, p in zip(string, gold, pred):
+                        result.append(" ".join([char, g, p]))
+                    # print(result)
+                    results.append(result)
+        # write the mismatch dump once, after all batches, instead of
+        # truncating and rewriting the file on every sample
+        with open('evaluate_result.txt','w', encoding='utf-8') as f:
+            for rs in results:
+                for line in rs:
+                    f.write(line+'\n')
+                f.write('\n')
 
         # return sum(F1)/len(F1),sum(Precision)/len(Precision),sum(Recall)/len(Recall)
         precision = equal_num/(pred_num+1e-10)

+ 68 - 0
BiddingKG/dl/ratio/re_ratio.py

@@ -0,0 +1,68 @@
+import re
+
+ratio = r'([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+\.?[0-9]*[((]?%?[))]?)'  # \. so only a decimal point is matched
+
+# 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,
+# 基准利率上浮率:百分之三十(30%)
+# 租金上浮率
+# 上浮率活期20%
+# 上浮率:活期20%、一年定期35%
+# 下浮率报价0.5%
+
+
+def re_standard_ratio(_str):
+    reg_standard = "(?P<value>" + ratio + ")"
+    ratio_list = []
+    # re.finditer always returns an iterator (truthy), so no need for an `if match:` guard
+    for m in re.finditer(reg_standard, _str):
+        m_dict = m.groupdict()
+        m_span = m.span()
+        keyword_index = [m_span[0], m_span[1]]
+        keyword = m_dict.get("value")
+        ratio_list.append([keyword, keyword_index])
+
+    return ratio_list
+
+
+def re_ratio(text):
+    # Find ratios written in the standard form
+    ratio_list = re_standard_ratio(text)
+    return ratio_list
+
+
+def extract_ratio(text):
+    result_list = []
+    ratio_list = re_ratio(text)
+    if ratio_list:
+        for word, text_index in ratio_list:
+            d = {"body": word, "begin_index": text_index[0],
+                 "end_index": text_index[1]}
+            result_list.append(d)
+    return result_list
+
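+# e.g. extract_ratio('上浮率:30%') -> [{'body': '上浮率:30%', 'begin_index': 0, 'end_index': 7}]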
+
+def test_str():
+    s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
+    s = '年利率较基准利率的上浮率(%): 30 活期存款下浮率:0.455% 协定存的下浮率,(1-下浮率)' \
+        ' 上浮率....  上浮率30(%)  (下浮率%):43  下浮率报价0.5%'
+    print(extract_ratio(s))
+
+
+def test_html():
+    html_path = "C:/Users/Administrator/Desktop/3.html"
+
+    with open(html_path, "r") as f:
+        s = f.read()
+
+    print(extract_ratio(s))
+
+
+if __name__ == "__main__":
+    # extract_bidway(s)
+
+    # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
+    test_str()
+    # test_html(path)
+    pass
+

+ 62 - 0
BiddingKG/dl/ratio/test_re_ratio.py

@@ -0,0 +1,62 @@
+import json
+import sys, os
+import time
+import pandas as pd
+sys.path.append(os.path.abspath("../../.."))
+print("sys.path[-1]", sys.path[-1])
+from BiddingKG.dl.interface.extract import predict
+
+
+def bidi_predict(html_str):
+    content = html_str
+    result_dict = json.loads(predict("1", content))
+    return result_dict
+
+
+def test_csv(_path):
+    start_time = time.time()
+    df = pd.read_csv(_path)
+
+    # ratio, total_money, unit_money
+    predict_list_1 = []
+    predict_list_2 = []
+    predict_list_3 = []
+    for index, row in df.iterrows():
+        # if index >= 1000:
+        #     break
+
+        if index % 50 == 0:
+            print("="*30, "Loop", index, time.time()-start_time, "="*30)
+
+        html_str = row["dochtmlcon"]
+
+        # 先经过模型处理
+        result_dict = bidi_predict(html_str)
+
+        # Get ratio, total price and unit price
+        word_list_1 = result_dict.get("total_money")
+        word_list_2 = result_dict.get("unit_money")
+        word_list_3 = result_dict.get("ratio")
+
+        # print("predict ratio", word_list_3)
+        predict_list_3.append(str(word_list_3))
+
+        # print("predict total money", word_list_1)
+        predict_list_1.append(str(word_list_1))
+
+        # print("predict unit money", word_list_2)
+        predict_list_2.append(str(word_list_2))
+
+    predict_df_1 = pd.DataFrame(predict_list_1)
+    predict_df_2 = pd.DataFrame(predict_list_2)
+    predict_df_3 = pd.DataFrame(predict_list_3)
+    df = pd.concat([df, predict_df_3, predict_df_1, predict_df_2], axis=1)
+    df.to_csv(_path)
+    print("finish write!", time.time()-start_time)
+
+
+if __name__ == "__main__":
+    # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
+    path = '比率_result.csv'
+    # path = '总价单价_result.csv'
+    test_csv(path)

+ 4 - 2
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -147,6 +147,8 @@ def predict(doc_id,text):
                     # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                     pass
                 # print(entity.pointer_pack)
+            # elif entity.entity_type =='serviceTime':
+            #     print(entity.entity_text)
             #     if entity.pointer_pack:
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
             # elif entity.entity_type in ['package']:
@@ -439,8 +441,8 @@ if __name__=="__main__":
     a = time.time()
     print("start")
     # print(predict("12",content))
-    # result = predict("12",text)
-    result = predict("12",content)
+    result = predict("12",text)
+    # result = predict("12",content)
     # print(json.loads(result))
     #test("12",text)
     print("takes",time.time()-a)

BIN
BiddingKG/dl/time/model_time_classify.weights


File diff suppressed because it is too large
+ 1 - 7
BiddingKG/dl/time/re_servicetime.py


+ 367 - 16
BiddingKG/dl/time/train_2.py

@@ -13,10 +13,32 @@ from BiddingKG.dl.common.models import *
 from sklearn.metrics import classification_report
 from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt
+import random
 
 input_shape = (2,30,60)
 input_shape2 = (2,40,128)
-output_shape = [4]
+# output_shape = [4]
+
+time_label_dict = {
+            'time': 0,
+            'time_release': 1,            # release time
+            'time_bidopen': 2,            # bid opening time
+            'time_bidclose': 3,           # bid closing time
+            'time_bidstart': 12,          # bidding (start) time / response-file receiving (start) time
+
+            'time_publicityStart': 4,     # publicity start time (publicity time / publicity period)
+            'time_publicityEnd': 5,       # publicity end time
+            'time_getFileStart': 6,       # document obtaining start time (document obtaining time)
+            'time_getFileEnd': 7,         # document obtaining deadline
+            'time_registrationStart': 8,  # registration start time (registration time)
+            'time_registrationEnd': 9,    # registration deadline
+            'time_earnestMoneyStart': 10, # earnest-money submission start time (earnest-money submission time)
+            'time_earnestMoneyEnd': 11,   # earnest-money submission deadline
+            'time_commencement': 13,      # commencement date
+            'time_completion': 14         # completion date
+        }
+output_shape = [len(time_label_dict)]
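+# i.e. output_shape == [15]; labels are one-hot encoded, e.g. class 2 -> [0, 0, 1, 0, ..., 0]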
+
 
 def get_data():
     data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
@@ -91,16 +113,23 @@ def getModel2():
     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
 
-    L_input_drop = Dropout(0.2)(L_input)
-    R_input_drop = Dropout(0.2)(R_input)
+    L_input_drop = Dropout(0.3)(L_input)
+    R_input_drop = Dropout(0.3)(R_input)
     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
     L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
     # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
     R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
     R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
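+    # joint attention over the L/R sequences concatenated along the time axis,
+    # added back onto each side's own attention as a residual signal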
+    L_R = layers.merge([L_lstm, R_lstm],concat_axis=1, mode='concat')
+    L_R_mask = layers.merge([L_mask, R_mask],concat_axis=1, mode='concat')
+    L_R_att = Attention02()(L_R,mask=K.squeeze(L_R_mask,axis=-1))
+
+    L_att = layers.add([L_att,L_R_att])
+    R_att = layers.add([R_att,L_R_att])
     concat = layers.merge([L_att, R_att], mode='concat')
-    concat = Dropout(0.3)(concat)
+
+    concat = Dropout(0.2)(concat)
     output = layers.Dense(output_shape[0],activation="softmax")(concat)
 
     model = models.Model(inputs=[L_input,R_input], outputs=output)
@@ -111,6 +140,36 @@ def getModel2():
                   metrics=[precision,recall,f1_score])
     model.summary()
     return model
+# def getModel2():
+#     '''
+#     @summary: 时间分类模型
+#     '''
+#     L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+#     L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+#     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+#     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+#
+#     L_input_drop = Dropout(0.3)(L_input)
+#     R_input_drop = Dropout(0.3)(R_input)
+#     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+#     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+#     L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+#     # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+#     R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+#     R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+#     concat = layers.merge([L_att, R_att], mode='concat')
+#
+#     concat = Dropout(0.2)(concat)
+#     output = layers.Dense(output_shape[0],activation="softmax")(concat)
+#
+#     model = models.Model(inputs=[L_input,R_input], outputs=output)
+#
+#     learn_rate = 0.00005
+#     model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+#                   loss=losses.binary_crossentropy,
+#                   metrics=[precision,recall,f1_score])
+#     model.summary()
+#     return model
 
 def getModel3():
     '''
@@ -121,8 +180,8 @@ def getModel3():
     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
 
-    L_input_drop = Dropout(0.2)(L_input)
-    R_input_drop = Dropout(0.2)(R_input)
+    L_input_drop = Dropout(0.3)(L_input)
+    R_input_drop = Dropout(0.3)(R_input)
     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
     # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
@@ -133,7 +192,7 @@ def getModel3():
     att = Attention02()(concat,mask=K.squeeze(concat_mask,axis=-1))
     # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
     # concat = layers.merge([L_att, R_att], mode='concat')
-    att = Dropout(0.3)(att)
+    att = Dropout(0.2)(att)
     output = layers.Dense(output_shape[0],activation="softmax")(att)
 
     model = models.Model(inputs=[L_input,R_input], outputs=output)
@@ -145,6 +204,72 @@ def getModel3():
     model.summary()
     return model
 
+class Attention(Layer):
+    """多头注意力机制
+    """
+    def __init__(self, nb_head, size_per_head, **kwargs):
+        self.nb_head = nb_head
+        self.size_per_head = size_per_head
+        self.out_dim = nb_head * size_per_head
+        super(Attention, self).__init__(**kwargs)
+    def build(self, input_shape):
+        super(Attention, self).build(input_shape)
+        q_in_dim = input_shape[0][-1]
+        k_in_dim = input_shape[1][-1]
+        v_in_dim = input_shape[2][-1]
+        self.q_kernel = self.add_weight(name='q_kernel',
+                                        shape=(q_in_dim, self.out_dim),
+                                        initializer='glorot_normal')
+        self.k_kernel = self.add_weight(name='k_kernel',
+                                        shape=(k_in_dim, self.out_dim),
+                                        initializer='glorot_normal')
+        self.v_kernel = self.add_weight(name='v_kernel',
+                                        shape=(v_in_dim, self.out_dim),
+                                        initializer='glorot_normal')
+    def mask(self, x, mask, mode='mul'):
+        if mask is None:
+            return x
+        else:
+            for _ in range(K.ndim(x) - K.ndim(mask)):
+                mask = K.expand_dims(mask, K.ndim(mask))
+            if mode == 'mul':
+                return x * mask
+            else:
+                return x - (1 - mask) * 1e10
+    def call(self, inputs):
+        q, k, v = inputs[:3]
+        v_mask, q_mask = None, None
+        if len(inputs) > 3:
+            v_mask = inputs[3]
+            if len(inputs) > 4:
+                q_mask = inputs[4]
+        # linear projections
+        qw = K.dot(q, self.q_kernel)
+        kw = K.dot(k, self.k_kernel)
+        vw = K.dot(v, self.v_kernel)
+        # reshape to (batch, seq_len, nb_head, size_per_head)
+        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
+        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
+        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
+        # transpose to (batch, nb_head, seq_len, size_per_head)
+        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
+        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
+        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
+        # Attention
+        a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head**0.5
+        a = K.permute_dimensions(a, (0, 3, 2, 1))
+        a = self.mask(a, v_mask, 'add')
+        a = K.permute_dimensions(a, (0, 3, 2, 1))
+        a = K.softmax(a)
+        # assemble the output
+        o = K.batch_dot(a, vw, [3, 2])
+        o = K.permute_dimensions(o, (0, 2, 1, 3))
+        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
+        o = self.mask(o, q_mask, 'mul')
+        return o
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0][0], input_shape[0][1], self.out_dim)
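+    # usage sketch (an assumption; the models above use Attention02, this layer
+    # is only defined here): o = Attention(8, 16)([q, k, v]) with optional
+    # [v_mask, q_mask]; output shape is (batch, len_q, 8 * 16)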
+
 class Attention02(Layer):
     def __init__(self, **kwargs):
         self.init = initializers.get('normal')
@@ -530,11 +655,216 @@ def train3():
     # # y_pre2 = load_model.predict(train_x[0])
     # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     # print(res2)
+
+def train4():
+    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
+    data_load = pd.read_excel("tokens_tolabel_data1_res13New.xlsx", index_col=0)
+    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
+    # data_load = data_load[data_load['pre_label_prob']>0.97]
+    # data_load = data_load[data_load['is_same']==1]
+    data_zero = pd.read_excel("time_entity5.xlsx")
+    data_zero = data_zero[(data_zero['viewed']==1)|(data_zero['is_same']==2)]
+    # data_old = pd.read_excel("tokens_data_02.xlsx")
+    data_old = pd.read_excel("tokens_data_02_res7New.xlsx")
+    data_delay1 = pd.read_excel("delayTime_entity1.xlsx")
+    data_delay1 = data_delay1[data_delay1['label']!=0]
+    data_delay2 = pd.read_excel("delayTime_entity2.xlsx")
+
+    # data_zero = pd.concat([data_zero,data_zero])
+    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
+    # data_zero = data_zero.sample(n=80000)
+    print("输入shape:",input_shape2)
+    data_x = []
+    data_y = []
+    for left, right, label,_label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
+        # if label==_label:
+
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+    # data_load2 = data_load[data_load['re_label']==0]
+    # for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
+    #         if label==_label:
+    #             y = np.zeros(output_shape)
+    #             y[label] = 1
+    #             left = eval(left)
+    #             left = left[-40:]
+    #             if len(left)>30:
+    #                 left = left[2:]
+    #             elif len(left)>15:
+    #                 left = left[1:]
+    #             right = eval(right)
+    #             right = right[:40]
+    #             if len(right)>15:
+    #                 right = right[:-1]
+    #             context = [left, right]
+    #             # x = embedding(context, shape=input_shape2)
+    #             data_x.append(context)
+    #             data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['re_label']):
+
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    for left, right, label in zip(data_delay1['context_left'], data_delay1['context_right'], data_delay1['label']):
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            data_x.append(context)
+            data_y.append(y)
+    for left, right, label in zip(data_delay2['context_left'], data_delay2['context_right'], data_delay2['re_label']):
+                y = np.zeros(output_shape)
+                y[label] = 1
+                left = eval(left)
+                left = left[-40:]
+                right = eval(right)
+                right = right[:40]
+                context = [left, right]
+                # x = embedding(context, shape=input_shape2)
+                data_x.append(context)
+                data_y.append(y)
+
+    # for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+    #         y = np.zeros(output_shape)
+    #         y[label] = 1
+    #         left = eval(left)
+    #         left = left[-40:]
+    #         if len(left) > 30:
+    #             left = left[2:]
+    #         elif len(left) > 15:
+    #             left = left[1:]
+    #         right = eval(right)
+    #         right = right[:40]
+    #         if len(right) > 15:
+    #             right = right[:-1]
+    #         context = [left, right]
+    #         # x = embedding(context, shape=input_shape2)
+    #         data_x.append(context)
+    #         data_y.append(y)
+
+    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+    #         y = np.zeros(output_shape)
+    #         y[label] = 1
+    #         left = eval(left)
+    #         left = left[-40:]
+    #         right = eval(right)
+    #         right = right[:40]
+    #         context = [left, right]
+    #         # x = embedding(context, shape=input_shape2)
+    #         data_x.append(context)
+    #         data_y.append(y)
+    for left, right, label,pre_label,is_same in zip(data_old['context_left'], data_old['context_right'], data_old['label'],
+                                                    data_old['pre_label'],data_old['is_same']):
+        if label==0:
+            if is_same==1:
+                pass
+            else:
+                if pre_label>3:
+                    label = pre_label
+                else:
+                    continue
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    _data = [d for d in zip(data_x,data_y)]
+    random.shuffle(_data)
+    data_x = [i[0] for i in _data]
+    data_y = [i[1] for i in _data]
+    test_len = int(len(data_x) * 0.11)
+    test_x = data_x[:test_len]
+    test_y = data_y[:test_len]
+    print("测试数据量:", len(test_x))
+    train_x = data_x[test_len:]
+    train_y = data_y[test_len:]
+
+    # for left, right, label,pre_label,is_same in zip(data_old['context_left'], data_old['context_right'], data_old['label'],
+    #                                                 data_old['pre_label'],data_old['is_same']):
+    #     # if label==0:
+    #     #     if random.random()>0.25:
+    #     #         continue
+    #     if label==0:
+    #         if is_same==1:
+    #             pass
+    #         else:
+    #             if pre_label>3:
+    #                 label = pre_label
+    #             else:
+    #                 continue
+    #     y = np.zeros(output_shape)
+    #     y[label] = 1
+    #     left = eval(left)
+    #     left = left[-40:]
+    #     right = eval(right)
+    #     right = right[:40]
+    #     context = [left, right]
+    #     # x = embedding(context, shape=input_shape2)
+    #     train_x.append(context)
+    #     train_y.append(y)
+    print("训练数据量:", len(train_x))
+
+    # train_y, test_y = np.array(train_y), np.array(test_y)
+    # train_x = np.array(train_x)
+    # test_x = np.array(test_x)
+    # test_x = np.transpose(test_x, (1, 0, 2, 3))
+    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
+    training_generator = DataGenerator(train_x, train_y,is_train=True)
+    # training_generator = DataGenerator(data_x, data_y)
+    validation_generator = DataGenerator(test_x, test_y,is_train=False,shuffle=False)
+
+    # model = getModel3()
+    model = getModel2()
+    epochs = 100
+    # batch_size = 256
+    checkpoint = ModelCheckpoint("model_time_classify.weights",save_weights_only=True, monitor="val_loss", verbose=1,
+                                 save_best_only=True, mode='min')
+    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
+    #                                  save_best_only=True, mode='min')
+
+    history = model.fit_generator(
+        generator=training_generator,
+        validation_data=validation_generator,
+        use_multiprocessing=True, workers=2,
+        epochs=epochs,
+        shuffle=True,
+        callbacks=[checkpoint],
+        class_weight='auto'
+    )
+
 from keras.utils import Sequence,to_categorical
 class DataGenerator(Sequence):
     'Generates data for Keras'
-    def __init__(self, texts, labels, batch_size=256,
-                 n_classes=4, shuffle=True):
+    def __init__(self, texts, labels, is_train=True,batch_size=256,
+                 n_classes=len(time_label_dict), shuffle=True):
         'Initialization'
         # self.dim = dim
         self.batch_size = batch_size
@@ -542,6 +872,7 @@ class DataGenerator(Sequence):
         self.texts = texts
         self.n_classes = n_classes
         self.shuffle = shuffle
+        self.is_train = is_train
         self.on_epoch_end()
 
     def __len__(self):
@@ -583,8 +914,22 @@ class DataGenerator(Sequence):
         # Generate data
         for i, context in enumerate(list_texts):
             # Store sample
-            # tokens = preprocess2(text)
-            # tokens = tokens[:maxlen]
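+            # train-time augmentation: randomly trim one or two tokens from the
+            # ends of long left/right contexts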
+            if self.is_train:
+                left = context[0]
+                if len(left) > 30:
+                    if random.random() > 0.5:
+                        left = left[2:]
+                elif len(left) > 15:
+                    if random.random() > 0.5:
+                        left = left[1:]
+                right = context[1]
+                if len(right) > 30:
+                    if random.random() > 0.5:
+                        right = right[:-2]
+                elif len(right) > 15:
+                    if random.random() > 0.5:
+                        right = right[:-1]
+                context = [left, right]
             words_matrix = embedding_mywords(context, shape=input_shape2)
             # Store class
             # y[i] = _label[i]
@@ -647,7 +992,11 @@ def predict3():
     new_data.to_excel("new_tokens_data1_res.xlsx")
 
 def predict4():
-    data = pd.read_csv("tokens_tolabel_data1_res11.csv", chunksize=3000)
+    data = pd.read_csv("tokens_data_02_res6New.csv", chunksize=3000)
+    # data = pd.read_excel("C:\\Users\\Administrator\\Desktop\\time_entity4.xlsx")
+    # data.to_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv")
+    # data = pd.read_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv", chunksize=3000)
+
     model1 = getModel2()
     model1.load_weights("model_time_classify.weights")
     new_data = pd.DataFrame()
@@ -671,14 +1020,15 @@ def predict4():
         pre_y = model1.predict([test_x[0], test_x[1]])
         _data['pre_label'] = [np.argmax(item) for item in pre_y]
         _data['pre_label_prob'] = [max(item) for item in pre_y]
-        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['re_label'],_data['pre_label'])]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['label'],_data['pre_label'])]
         # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
         # data['label'] = label
         new_data = pd.concat([new_data, _data])
         idx += 3000
         print(idx)
-    # data.to_csv("new_tokens_data1.csv")
-    new_data.to_excel("tokens_tolabel_data1_res12.xlsx")
+    # new_data.to_csv("tokens_data_02_res7New.csv")
+    new_data.to_excel("tokens_data_02_res7New.xlsx")
+    # new_data.to_excel("C:\\Users\\Administrator\\Desktop\\tokens_data_02_res7New.xlsx")
 
 
 def predict():
@@ -863,7 +1213,7 @@ def save_model():
             test_model = getModel2()
             test_model.load_weights("model_time_classify.weights")
             tf.saved_model.simple_save(sess,
-                                       "models/timesplit_model/",
+                                       "models/timesplit_model2/",
                                        inputs={"input0": test_model.input[0],
                                                "input1":test_model.input[1]
                                                },
@@ -879,6 +1229,7 @@ if __name__ == '__main__':
     # training()
     # train2()
     # train3()
+    # train4()
     # data_process()
     # data_process2()
     # data_process3()

Some files were not shown because too many files changed in this diff