3 سال پیش · 67c7f4f0b1
--- a/BiddingKG/dl/channel/re_channel_103.py
+++ b/BiddingKG/dl/channel/re_channel_103.py
@@ -0,0 +1,219 @@
 
				+import pandas as pd
			
 
				+import re
			
 
				+
			
 
				+# 各投标人
			
 
				+# 各潜在投标人
			
 
				+# 各潜在投标人：
			
 
				+# 致各招标文件持有者：
			
 
				+# 致各投标人
			
 
				+# 各潜在投标供应商：
			
 
				+
			
 
				+# 修改、澄清(答疑)纪要内容如下: 1、
			
 
				+# 答疑澄清与修改的主要内容：
			
 
				+# 对文件澄清与修改的主要内容
			
 
				+# 澄清、修改内容要点
			
 
				+# 答疑纪要
			
 
				+# 答疑如下
			
 
				+# 招标文件答疑和招标文件修改通知
			
 
				+# 招标文件答疑通知
			
 
				+# 答疑及补遗通知
			
 
				+# 答疑回复如下：
			
 
				+# 现对投标人提出的质疑回复如下：
			
 
				+# 对文件澄清与修改的主要内容 详见招标文件
			
 
				+# 修改的主要内容 详见附件
			
 
				+# 澄清或修改事项：
			
 
				+
			
 
				+# 第1次答疑
			
 
				+# 第1次答疑澄清
			
 
				+
			
 
				+# 答疑补遗文件
			
 
				+# 补遗书澄清文件 答疑澄清
			
 
				+# 质疑1
			
 
				+# 问题
			
 
				+# 答疑文件1
			
 
				+# 具体补遗内容详见附件
			
 
				+# 请问 答
			
 
				+# 问题 回复
			
 
				+# 答疑澄清公告 1：
			
 
				+# 现对招标文件作如下澄清：
			
 
				+# 详见答疑澄清文件
			
 
				+# 详见答疑文件。
			
 
				+
			
 
				+
			
 
				+channel_103 = '(澄清|答疑|补遗|修改)'
			
 
				+channel_103_0 = '(致|至|)(各|各个)(潜在|)(投标|招标|招标文件持有|报价|竞选|)(人|者|供应商|单位)(:|：)'
			
 
				+channel_103_1 = '(澄清|答疑|补遗|修改|质疑)(.?)(具体内容|主要内容|内容|回复|发布|纪要|事项|如下){1,2}(.?)' \
			
 
				+                '(如下|[：:]|详见|点击下载附件|[1一][:：、]|（1）|\\(1\\)|一)'
			
 
				+channel_103_2 = '第(.?)次(答疑|澄清)'
			
 
				+channel_103_3 = '(澄清|答疑|补遗|修改)(公告|文件)'
			
 
				+channel_103_after = '(请问|提问|问题|答复|回复|质疑|答|问){1,2}[12一]?[:：]|[一1][:：、]|（1）|\\(1\\)|(详见|见)(附件|答疑文件|澄清文件|答疑澄清文件)'
			
 
				+channel_103_4 = '(补充答疑|提疑内容|请问|提问|问题|回复|答复|答疑|质疑|答|问)[12一]?[:：]'
			
 
				+channel_103_5 = '(见|详见)(答疑澄清文件|澄清文件|答疑文件)|补遗内容详见附件'
			
 
				+
			
 
				+# 答疑澄清时间
			
 
				+# 对文件澄清与修改的主要内容 无澄清文件
			
 
				+# 对文件澄清与修改的主要内容 无
			
 
				+# 请各投标单位自行下载
			
 
				+not_channel_103 = '答疑澄清时间|主要内容.?无|请各投标单位'
			
 
				+
			
 
				+
			
 
				+def re_standard_channel_103(_str):
			
 
				+    channel_103_list = []
			
 
				+
			
 
				+    if not re.search(channel_103, _str):
			
 
				+        print("not")
			
 
				+        return channel_103_list
			
 
				+
			
 
				+    reg_standard = "(?P<value>" + channel_103_0 + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    for m in match:
			
 
				+        m_dict = m.groupdict()
			
 
				+        m_span = m.span()
			
 
				+        keyword_index = [m_span[0], m_span[1]]
			
 
				+        keyword = m_dict.get('value')
			
 
				+        channel_103_list.append([keyword, keyword_index])
			
 
				+    if channel_103_list:
			
 
				+        print("0", channel_103_list)
			
 
				+        return channel_103_list
			
 
				+
			
 
				+    reg_standard = "(?P<value>" + channel_103_1 + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    for m in match:
			
 
				+        m_dict = m.groupdict()
			
 
				+        m_span = m.span()
			
 
				+        keyword_index = [m_span[0], m_span[1]]
			
 
				+        keyword = m_dict.get('value')
			
 
				+        channel_103_list.append([keyword, keyword_index])
			
 
				+    if channel_103_list:
			
 
				+        print("1", channel_103_list)
			
 
				+        return channel_103_list
			
 
				+
			
 
				+    reg_standard = "(?P<value>" + channel_103_2 + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    for m in match:
			
 
				+        m_dict = m.groupdict()
			
 
				+        m_span = m.span()
			
 
				+        keyword_index = [m_span[0], m_span[1]]
			
 
				+        keyword = m_dict.get('value')
			
 
				+        if re.search(channel_103_after, _str[keyword_index[1]:keyword_index[1]+50]):
			
 
				+            channel_103_list.append([keyword, keyword_index])
			
 
				+    if channel_103_list:
			
 
				+        print("2", channel_103_list)
			
 
				+        return channel_103_list
			
 
				+
			
 
				+    reg_standard = "(?P<value>" + channel_103_3 + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    for m in match:
			
 
				+        m_dict = m.groupdict()
			
 
				+        m_span = m.span()
			
 
				+        keyword_index = [m_span[0], m_span[1]]
			
 
				+        keyword = m_dict.get('value')
			
 
				+        if re.search(channel_103_after, _str[keyword_index[1]:keyword_index[1]+50]):
			
 
				+            channel_103_list.append([keyword, keyword_index])
			
 
				+    if channel_103_list:
			
 
				+        print("3", channel_103_list)
			
 
				+        return channel_103_list
			
 
				+
			
 
				+    reg_standard = "(?P<value>" + channel_103_4 + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    for m in match:
			
 
				+        m_dict = m.groupdict()
			
 
				+        m_span = m.span()
			
 
				+        keyword_index = [m_span[0], m_span[1]]
			
 
				+        keyword = m_dict.get('value')
			
 
				+        channel_103_list.append([keyword, keyword_index])
			
 
				+    if channel_103_list:
			
 
				+        print("4", channel_103_list)
			
 
				+        return channel_103_list
			
 
				+
			
 
				+    reg_standard = "(?P<value>" + channel_103_5 + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    for m in match:
			
 
				+        m_dict = m.groupdict()
			
 
				+        m_span = m.span()
			
 
				+        keyword_index = [m_span[0], m_span[1]]
			
 
				+        keyword = m_dict.get('value')
			
 
				+        channel_103_list.append([keyword, keyword_index])
			
 
				+    if channel_103_list:
			
 
				+        print("5", channel_103_list)
			
 
				+        return channel_103_list
			
 
				+
			
 
				+    return channel_103_list
			
 
				+
			
 
				+
			
 
				+def re_not_channel_103(_str):
			
 
				+    match = re.findall(not_channel_103, _str)
			
 
				+    if match:
			
 
				+        for word in match:
			
 
				+            instead = "#" * len(word)
			
 
				+            _str = re.sub(word, instead, _str)
			
 
				+    return _str
			
 
				+
			
 
				+
			
 
				+def re_channel_103(text):
			
 
				+    # 替换易混淆词
			
 
				+    clean_text = re_not_channel_103(text)
			
 
				+
			
 
				+    # 查找符合标准形式的
			
 
				+    channel_103_list = re_standard_channel_103(clean_text)
			
 
				+    return channel_103_list
			
 
				+
			
 
				+
			
 
				+def extract_channel_103(text):
			
 
				+    result_list = []
			
 
				+    channel_103_list = re_channel_103(text)
			
 
				+    if channel_103_list:
			
 
				+        for word, text_index in channel_103_list:
			
 
				+            if word is not None:
			
 
				+                if text_index[1]-text_index[0] != len(word) \
			
 
				+                        or text_index[1]-text_index[0] >= 20:
			
 
				+                    return []
			
 
				+                d = {"body": word, "begin_index": text_index[0], "end_index": text_index[1]}
			
 
				+                result_list.append(d)
			
 
				+    return result_list
			
 
				+
			
 
				+
			
 
				+def test_csv(_path):
			
 
				+    df = pd.read_csv(_path)
			
 
				+
			
 
				+    predict_list = []
			
 
				+    for index, row in df.iterrows():
			
 
				+        word_list = re_channel_103(row["doctextcon"], "")
			
 
				+        if word_list:
			
 
				+            predict = word_list
			
 
				+        else:
			
 
				+            predict = []
			
 
				+        print("predict", predict)
			
 
				+        predict_list.append(str(predict))
			
 
				+
			
 
				+    predict_df = pd.DataFrame(predict_list)
			
 
				+    df = pd.concat([df, predict_df], axis=1)
			
 
				+
			
 
				+    df.to_csv(_path)
			
 
				+    print("finish write!")
			
 
				+
			
 
				+
			
 
				+def test_str():
			
 
				+    s = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
			
 
				+    s = '''
			
 
				+    (第1次澄清） 发布时间：2020-11-25 致各招标文件持有者： 招标人──舟山市
			
 
				+    '''
			
 
				+    print(extract_channel_103(s))
			
 
				+
			
 
				+
			
 
				+def test_html():
			
 
				+    html_path = "C:/Users/Administrator/Desktop/3.html"
			
 
				+
			
 
				+    with open(html_path, "r") as f:
			
 
				+        s = f.read()
			
 
				+
			
 
				+    print(extract_channel_103(s, title=""))
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    path = "D:\\BIDI_DOC\\比地_文档\\澄清答疑_result.csv"
			
 
				+    # test_csv(path)
			
 
				+    test_str()
			
 
				+    # test_html(path)
			
 
				+    pass
			
 
				+
			
--- a/BiddingKG/dl/if_joint_bidding/re_if_joint_bidding.py
+++ b/BiddingKG/dl/if_joint_bidding/re_if_joint_bidding.py
@@ -0,0 +1,231 @@
 
				+import pandas as pd
			
 
				+import re
			
 
				+
			
 
				+# 申请人可以组成联合体报名，联合体的家数最多不能超过两家
			
 
				+# 本项目不接受供应商以联合体方式进行投标。
			
 
				+
			
 
				+bidway = '(参与|)(报价|投标|招标|竞价|报名|参加|资格预审|应答|谈判|磋商|竞标)(登记|)'
			
 
				+
			
 
				+# 是否接收联合体投标： 不接受
			
 
				+# 联合体投标: 不允许
			
 
				+# 是否允许联合体投标登记：是
			
 
				+# （是/否）接受联合体投标：否
			
 
				+# 是否接受联合体投标 不接受
			
 
				+# 是否接受联合体投标:不接受
			
 
				+# 本项目（是/否）接受联合体投标：否
			
 
				+# joint_bidding_prefix_1 = '(不[ ]?|[(（]{0,1}[ ]?[是否不][ ]?[）)]{0,1}|)'
			
 
				+joint_bidding_prefix_1 = "(是否|)"
			
 
				+bidway_1 = bidway
			
 
				+joint_bidding_body_1 = '(允许|接受|接收|)(联合体|独立体或联合体)' + bidway_1
			
 
				+joint_bidding_suffix_1 = '([ :：。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'
			
 
				+
			
 
				+# 不接受(接受或不接受)联合体投标
			
 
				+# （否）接受联合体。
			
 
				+# （不）接受联合体投标
			
 
				+# （ 不 ）接受联合体。
			
 
				+# 本项目 不 允许联合体投标。
			
 
				+# （否）接受联合体投标
			
 
				+# 本项目不接受联合体参与投标。
			
 
				+# 本合同包接受联合体投标
			
 
				+# 本项目不接受联合体应答，
			
 
				+# 不接受联合体投标
			
 
				+# 否 接受联合体
			
 
				+# 接受 联合体资格预审
			
 
				+# 接受独立体或联合体报名，联合体的家数最多不能超过两家
			
 
				+joint_bidding_prefix_2 = '(不[ ]?|[(（]{0,1}[ ]?[是否不][ ]?[）)]{0,1}|)'
			
 
				+bidway_2 = "(" + bidway + "|)"
			
 
				+joint_bidding_body_2 = '(允许|接受|接收).?(联合体|独立体或联合体)' + bidway_2
			
 
				+joint_bidding_suffix_2 = '([ :：。]{0,2})(不接受|不接收|不允许|允许|接受|接收|是|否|)'
			
 
				+# joint_bidding_suffix_2 = ""
			
 
				+
			
 
				+# 是否允许联合体 不允许
			
 
				+joint_bidding_prefix_3 = '(是否)'
			
 
				+joint_bidding_body_3 = '(允许|接受|接收).?(联合体|独立体或联合体)'
			
 
				+joint_bidding_suffix_3 = '([ :：。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'
			
 
				+
			
 
				+
			
 
				+# 是否接受联合体投标：（ ）是（√ ）否。
			
 
				+
			
 
				+
			
 
				+# 投标人须知前附表规定接受联合体投标的
			
 
				+# 联合体投标的，
			
 
				+# 允许联合体投标的
			
 
				+# 如项目接受联合体投标
			
 
				+# （是/否）接受联合体投标: 是 否
			
 
				+# 招标□接受 ?不接受联合体投标
			
 
				+# 联合体投标：接受；不接受
			
 
				+# （是/否）
			
 
				+# 是 否
			
 
				+# 接受；不接受
			
 
				+# 接受 ?不接受
			
 
				+# (接受或不接受)
			
 
				+# 是否允许联合体： 1 是 0 否
			
 
				+# 允许联合体报名 □是 ■ 否
			
 
				+not_joint_bidding_1 = '(' \
			
 
				+                      '联合体投标的|如项目接受联合体投标' \
			
 
				+                      '|是否允许联合体： 1 是 0 否' \
			
 
				+                      '|联合体参加的|联合体牵头人|联合体牵头方|联合体成员|联合体（牵头人）' \
			
 
				+                      '|联合体各方|联合体协议' \
			
 
				+                      '|允许联合体报名 □是 ■ 否' \
			
 
				+                      ')'
			
 
				+not_joint_bidding_2 = '(' \
			
 
				+                      '[(（]{0,1}.?是.{1,2}否[)）]{0,1}' \
			
 
				+                      '|[(（]{0,1}.?接受.{0,2}不接受[)）]{0,1}' \
			
 
				+                      '|1 是 0 否' \
			
 
				+                      '|.{1}接受.{1,2}不接受' \
			
 
				+                      ')'
			
 
				+
			
 
				+
			
 
				+def re_not_joint_bidding(_str):
			
 
				+    _str = re.sub(not_joint_bidding_1, "", _str)
			
 
				+    _str = re.sub(not_joint_bidding_2, "", _str)
			
 
				+    return _str
			
 
				+
			
 
				+
			
 
				+def re_standard_joint_bidding(_str):
			
 
				+    # 第一种形式
			
 
				+    reg_standard = "(?P<prefix>" + joint_bidding_prefix_1 + ")" \
			
 
				+                   + "(?P<body>" + joint_bidding_body_1 + ")" \
			
 
				+                   + "(?P<suffix>" + joint_bidding_suffix_1 + ")"
			
 
				+    # print("prefix", re.findall(joint_bidding_prefix_1, _str))
			
 
				+    # print("body", re.search(joint_bidding_body_1, _str))
			
 
				+    # print("suffix", re.search(joint_bidding_suffix_1, _str))
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    joint_bidding_list = []
			
 
				+    for m in match:
			
 
				+        m_dict = m.groupdict()
			
 
				+        m_span = m.span()
			
 
				+        keyword = ""
			
 
				+        keyword += m_dict.get("prefix") + m_dict.get("body") + m_dict.get("suffix")
			
 
				+        joint_bidding_list.append([keyword, m_span[0], m_span[1]])
			
 
				+    if joint_bidding_list:
			
 
				+        return joint_bidding_list
			
 
				+
			
 
				+    # 第二种形式
			
 
				+    reg_standard = "(?P<prefix>" + joint_bidding_prefix_2 + ")" \
			
 
				+                   + "(?P<body>" + joint_bidding_body_2 + ")" \
			
 
				+                   + "(?P<suffix>" + joint_bidding_suffix_2 + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    # print("prefix", re.findall(joint_bidding_prefix_2, _str))
			
 
				+    # print("body", re.search(joint_bidding_body_2, "接受 联合体资格预审"))
			
 
				+    # print("suffix", re.search(joint_bidding_suffix_2, _str))
			
 
				+    joint_bidding_list = []
			
 
				+    for m in match:
			
 
				+        m_dict = m.groupdict()
			
 
				+        m_span = m.span()
			
 
				+        keyword = ""
			
 
				+        keyword += m_dict.get("prefix") + m_dict.get("body") + m_dict.get("suffix")
			
 
				+        # 排除 '是否' 的情况
			
 
				+        if _str[m_span[0]-1:m_span[0]] != "是":
			
 
				+            joint_bidding_list.append([keyword, [m_span[0], m_span[1]]])
			
 
				+    if joint_bidding_list:
			
 
				+        return joint_bidding_list
			
 
				+
			
 
				+    # 第三种形式
			
 
				+    reg_standard = "(?P<prefix>" + joint_bidding_prefix_3 + ")" \
			
 
				+                   + "(?P<body>" + joint_bidding_body_3 + ")" \
			
 
				+                   + "(?P<suffix>" + joint_bidding_suffix_3 + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    # print("prefix", re.findall(joint_bidding_prefix_2, _str))
			
 
				+    # print("body", re.search(joint_bidding_body_2, "接受 联合体资格预审"))
			
 
				+    # print("suffix", re.search(joint_bidding_suffix_2, _str))
			
 
				+    joint_bidding_list = []
			
 
				+    for m in match:
			
 
				+        m_dict = m.groupdict()
			
 
				+        m_span = m.span()
			
 
				+        keyword = ""
			
 
				+        keyword += m_dict.get("prefix") + m_dict.get("body") + m_dict.get("suffix")
			
 
				+        joint_bidding_list.append([keyword, [m_span[0], m_span[1]]])
			
 
				+    if joint_bidding_list:
			
 
				+        return joint_bidding_list
			
 
				+
			
 
				+    return joint_bidding_list
			
 
				+
			
 
				+
			
 
				+def re_joint_bidding(text):
			
 
				+    # 替换小括号
			
 
				+    text_clean = re.sub("\\(", "（", text)
			
 
				+    text_clean = re.sub("\\)", "）", text_clean)
			
 
				+
			
 
				+    # 替换易混淆词
			
 
				+    text_clean = re_not_joint_bidding(text_clean)
			
 
				+    # print("clean", text_clean)
			
 
				+
			
 
				+    # 查找符合标准形式的
			
 
				+    joint_bidding_list = re_standard_joint_bidding(text_clean)
			
 
				+    return joint_bidding_list
			
 
				+
			
 
				+
			
 
				+def judge_joint_bidding(_list):
			
 
				+    new_list = []
			
 
				+    for l in _list:
			
 
				+        if "否" in l[0] or "不" in l[0]:
			
 
				+            new_list.append(["0" + " " + l[0], l[1]])
			
 
				+        else:
			
 
				+            new_list.append(["1" + " " + l[0], l[1]])
			
 
				+
			
 
				+    return new_list
			
 
				+
			
 
				+
			
 
				+def extract_joint_bidding(text):
			
 
				+    result_list = []
			
 
				+    joint_bidding_list = re_joint_bidding(text)
			
 
				+    joint_bidding_list = judge_joint_bidding(joint_bidding_list)
			
 
				+    if joint_bidding_list:
			
 
				+        for word, text_index in joint_bidding_list:
			
 
				+            if word is not None:
			
 
				+                d = {"body": word, "begin_index": text_index[0], "end_index": text_index[1]}
			
 
				+                result_list.append(d)
			
 
				+    return result_list
			
 
				+
			
 
				+
			
 
				+def test_csv(_path):
			
 
				+    df = pd.read_csv(_path)
			
 
				+
			
 
				+    predict_list = []
			
 
				+    for index, row in df.iterrows():
			
 
				+        word_list = re_joint_bidding(row["doctextcon"])
			
 
				+        if word_list:
			
 
				+            predict = word_list
			
 
				+        else:
			
 
				+            predict = []
			
 
				+        print("predict", predict)
			
 
				+        predict_list.append(str(predict))
			
 
				+
			
 
				+    predict_df = pd.DataFrame(predict_list)
			
 
				+    df = pd.concat([df, predict_df], axis=1)
			
 
				+
			
 
				+    df.to_csv(_path)
			
 
				+    print("finish write!")
			
 
				+
			
 
				+
			
 
				+def test_str():
			
 
				+    # （不）接受联合体投标
			
 
				+    # 本项目不接受供应商以联合体方式进行投标。
			
 
				+    # （否）接受联合体。
			
 
				+    # 是否接收联合体投标： 不接受
			
 
				+    # 联合体投标: 不允许
			
 
				+    # 是否允许联合体投标登记：是
			
 
				+    s = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
			
 
				+    s = '''
			
 
				+测绘服务 是否允许联合体 不允许 行业
			
 
				+    '''
			
 
				+    print(extract_joint_bidding(s))
			
 
				+
			
 
				+
			
 
				+def test_html(_path):
			
 
				+    html_path = _path
			
 
				+
			
 
				+    with open(html_path, "r") as f:
			
 
				+        s = f.read()
			
 
				+
			
 
				+    print(extract_joint_bidding(s, title=""))
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    path = "D:\\BIDI_DOC\\比地_文档\\投标工期_result.csv"
			
 
				+    test_csv(path)
			
 
				+    # test_str()
			
 
				+    # test_html(path)
			
 
				+    pass
			
 
				+
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -4,7 +4,6 @@ Created on 2019年1月4日
 
				 @author: User
			
 
				 '''
			
 
				 import os
			
 
				-
			
 
				 from bs4 import BeautifulSoup, Comment
			
 
				 import copy
			
 
				 import re
			
@@ -24,10 +23,11 @@ import BiddingKG.dl.interface.Preprocessing as Preprocessing
 
				 import BiddingKG.dl.interface.getAttributes as getAttributes
			
 
				 import BiddingKG.dl.complaint.punish_predictor as punish_rule
			
 
				 import json
			
 
				+from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
			
 
				+from BiddingKG.dl.ratio.re_ratio import extract_ratio
			
 
				 
			
 
				 
			
 
				-
			
 
				-#自定义jsonEncoder
			
 
				+# 自定义jsonEncoder
			
 
				 class MyEncoder(json.JSONEncoder):
			
 
				     def default(self, obj):
			
 
				         if isinstance(obj, np.ndarray):
			
@@ -41,39 +41,40 @@ class MyEncoder(json.JSONEncoder):
 
				             return obj
			
 
				         return json.JSONEncoder.default(self, obj)
			
 
				 
			
 
				+
			
 
				 def predict(doc_id,text,title="",page_time="",**kwargs):
			
 
				     cost_time = dict()
			
 
				 
			
 
				     start_time = time.time()
			
 
				-    log("start process doc %s"%(str(doc_id)))
			
 
				+    # log("start process doc %s"%(str(doc_id)))
			
 
				     list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
			
 
				-    log("get preprocessed done of doc_id%s"%(doc_id))
			
 
				+    # log("get preprocessed done of doc_id%s"%(doc_id))
			
 
				     cost_time["preprocess"] = round(time.time()-start_time,2)
			
 
				     cost_time.update(_cost_time)
			
 
				 
			
 
				-    #依赖句子顺序
			
 
				+    # 依赖句子顺序
			
 
				     start_time = time.time()
			
 
				     list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
			
 
				     cost_time["channel"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     start_time = time.time()
			
 
				     codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
			
 
				-    log("get codename done of doc_id%s"%(doc_id))
			
 
				+    # log("get codename done of doc_id%s"%(doc_id))
			
 
				     cost_time["codename"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     start_time = time.time()
			
 
				     predictor.getPredictor("prem").predict(list_sentences,list_entitys)
			
 
				-    log("get prem done of doc_id%s"%(doc_id))
			
 
				+    # log("get prem done of doc_id%s"%(doc_id))
			
 
				     cost_time["prem"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     start_time = time.time()
			
 
				     predictor.getPredictor("product").predict(list_sentences,list_entitys)
			
 
				-    log("get product done of doc_id%s"%(doc_id))
			
 
				+    # log("get product done of doc_id%s"%(doc_id))
			
 
				     cost_time["product"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     start_time = time.time()
			
 
				     product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
			
 
				-    log("get product attributes done of doc_id%s"%(doc_id))
			
 
				+    # log("get product attributes done of doc_id%s"%(doc_id))
			
 
				     cost_time["product_attrs"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     start_time = time.time()
			
@@ -82,12 +83,12 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
				 
			
 
				     start_time = time.time()
			
 
				     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
			
 
				-    log("get epc done of doc_id%s"%(doc_id))
			
 
				+    # log("get epc done of doc_id%s"%(doc_id))
			
 
				     cost_time["person"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     start_time = time.time()
			
 
				     predictor.getPredictor("time").predict(list_sentences, list_entitys)
			
 
				-    log("get time done of doc_id%s"%(doc_id))
			
 
				+    # log("get time done of doc_id%s"%(doc_id))
			
 
				     cost_time["time"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     # 需在getPredictor("prem")后  getAttributes.getPREMs 前
			
@@ -104,11 +105,46 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
				                         _entity.values[1] = 0.51
			
 
				                         _entity.set_Money(1, _entity.values)
			
 
				 
			
 
				-    #依赖句子顺序
			
 
				+    # 2021-12-08新增：提取：总价,单价,比率
			
 
				+    total_money_list = []
			
 
				+    unit_money_list = []
			
 
				+    ratio_list = []
			
 
				+    for i in range(len(list_entitys)):
			
 
				+        list_entity = list_entitys[i]
			
 
				+
			
 
				+        # 总价单价
			
 
				+        for _entity in list_entity:
			
 
				+            if _entity.entity_type == 'money':
			
 
				+                word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text
			
 
				+                # 总价在中投标金额中
			
 
				+                if _entity.label == 1:
			
 
				+                    result = extract_total_money(word_of_sentence,
			
 
				+                                                 _entity.entity_text,
			
 
				+                                                 [_entity.wordOffset_begin, _entity.wordOffset_end])
			
 
				+                    if result:
			
 
				+                        total_money_list.append(result)
			
 
				+
			
 
				+                # 单价在普通金额中
			
 
				+                else:
			
 
				+                    result = extract_unit_money(word_of_sentence,
			
 
				+                                                _entity.entity_text,
			
 
				+                                                [_entity.wordOffset_begin, _entity.wordOffset_end])
			
 
				+                    if result:
			
 
				+                        unit_money_list.append(result)
			
 
				+
			
 
				+        # 比率
			
 
				+        all_sentence = ""
			
 
				+        for sentence in list_sentences[i]:
			
 
				+            all_sentence += sentence.sentence_text + "，"
			
 
				+        result = extract_ratio(all_sentence)
			
 
				+        if result:
			
 
				+            ratio_list.append(result)
			
 
				+
			
 
				+    # 依赖句子顺序
			
 
				     start_time = time.time()
			
 
				     entityLink.link_entitys(list_entitys)
			
 
				     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
			
 
				-    log("get attributes done of doc_id%s"%(doc_id))
			
 
				+    # log("get attributes done of doc_id%s"%(doc_id))
			
 
				     cost_time["attrs"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     start_time = time.time()
			
@@ -121,13 +157,17 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
				                 if product in d['project_name']:
			
 
				                     d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
			
 
				 
			
 
				-    #print(prem)
			
 
				+    # print(prem)
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				     data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1])
			
 
				     data_res["cost_time"] = cost_time
			
 
				     data_res["success"] = True
			
 
				 
			
 
				+    data_res["total_money"] = total_money_list
			
 
				+    data_res["unit_money"] = unit_money_list
			
 
				+    data_res["ratio"] = ratio_list
			
 
				+
			
 
				     # for _article in list_articles:
			
 
				     #     log(_article.content)
			
 
				     #
			
--- a/BiddingKG/dl/money/re_money_total_unit.py
+++ b/BiddingKG/dl/money/re_money_total_unit.py
@@ -0,0 +1,130 @@
 
				+import json
			
 
				+import pandas as pd
			
 
				+import re
			
 
				+from bs4 import BeautifulSoup
			
 
				+
			
 
				+# 总价
			
 
				+total_money = '(合计.?金额|合.?计|总.?价)'
			
 
				+# 单价
			
 
				+unit_money = '(单价|([0-9.，,]+([（(]?元[)）]?)?/))'
			
 
				+
			
 
				+
			
 
				+def re_standard_total(_str):
			
 
				+    reg_standard = "(?P<value>" + total_money + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    total_money_list = []
			
 
				+    if match:
			
 
				+        for m in match:
			
 
				+            m_dict = m.groupdict()
			
 
				+            m_span = m.span()
			
 
				+            keyword_index = [m_span[0], m_span[1]]
			
 
				+            keyword = m_dict.get("value")
			
 
				+            # total_money_list.append([keyword, keyword_index])
			
 
				+            total_money_list.append([keyword, keyword_index, _str])
			
 
				+
			
 
				+    return total_money_list
			
 
				+
			
 
				+
			
 
				+def re_standard_unit(_str):
			
 
				+    reg_standard = "(?P<value>" + unit_money + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    unit_money_list = []
			
 
				+    if match:
			
 
				+        for m in match:
			
 
				+            m_dict = m.groupdict()
			
 
				+            m_span = m.span()
			
 
				+            keyword_index = [m_span[0], m_span[1]]
			
 
				+            keyword = m_dict.get("value")
			
 
				+            # unit_money_list.append([keyword, keyword_index])
			
 
				+            unit_money_list.append([keyword, keyword_index, _str])
			
 
				+
			
 
				+    return unit_money_list
			
 
				+
			
 
				+
			
 
				+def re_total(text, money, index):
			
 
				+    # 对已提取的中投标金额的前面文字进行正则
			
 
				+    prefix_threshold = 10
			
 
				+    suffix_threshold = 10
			
 
				+    # if index_threshold < index[0]:
			
 
				+    #     money_text = text[index[0]-index_threshold:index[0]]
			
 
				+    #     print("total", money, text[index[0]-index_threshold:index[1]], money_text)
			
 
				+    # else:
			
 
				+    #     money_text = text[:index[0]]
			
 
				+    #     print("total", money, text[:index[1]], money_text)
			
 
				+
			
 
				+    prefix_index = index[0] - prefix_threshold
			
 
				+    suffix_index = index[1] + suffix_threshold
			
 
				+    money_text = text[prefix_index if prefix_index > 0 else 0:
			
 
				+                      suffix_index if suffix_index < len(text) else len(text)]
			
 
				+
			
 
				+    # 查找符合标准形式的 总价
			
 
				+    total_money_list = re_standard_total(money_text)
			
 
				+    return total_money_list
			
 
				+
			
 
				+
			
 
				+def re_unit(text, money, index):
			
 
				+    # 对已提取的中投标金额的前面文字进行正则
			
 
				+    prefix_threshold = 10
			
 
				+    suffix_threshold = 10
			
 
				+    # if prefix_threshold < index[0]:
			
 
				+    #     money_text = text[index[0]-prefix_threshold:index[0]]
			
 
				+    #     print("unit", money, text[index[0]-prefix_threshold:index[1]], money_text)
			
 
				+    # else:
			
 
				+    #     money_text = text[:index[0]]
			
 
				+    #     print("unit", money, text[:index[1]], money_text)
			
 
				+
			
 
				+    prefix_index = index[0] - prefix_threshold
			
 
				+    suffix_index = index[1] + suffix_threshold
			
 
				+    money_text = text[prefix_index if prefix_index > 0 else 0:
			
 
				+                      suffix_index if suffix_index < len(text) else len(text)]
			
 
				+
			
 
				+    # 查找符合标准形式的 单价
			
 
				+    unit_money_list = re_standard_unit(money_text)
			
 
				+    return unit_money_list
			
 
				+
			
 
				+
			
 
				+def extract_total_money(text, money, index):
			
 
				+    result_list = []
			
 
				+    total_money_list = re_total(text, money, index)
			
 
				+    if total_money_list:
			
 
				+        for word, text_index, context in total_money_list:
			
 
				+            d = {"body": word, "begin_index": text_index[0],
			
 
				+                 "end_index": text_index[1], "context": context}
			
 
				+            result_list.append(d)
			
 
				+    return result_list
			
 
				+
			
 
				+
			
 
				+def extract_unit_money(text, money, index):
			
 
				+    result_list = []
			
 
				+    unit_money_list = re_unit(text, money, index)
			
 
				+    if unit_money_list:
			
 
				+        for word, text_index, context in unit_money_list:
			
 
				+            d = {"body": word, "begin_index": text_index[0],
			
 
				+                 "end_index": text_index[1], "context": context}
			
 
				+            result_list.append(d)
			
 
				+    return result_list
			
 
				+
			
 
				+
			
 
				+def test_str():
			
 
				+    s = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
			
 
				+    s = '往往，20(元)/平方'
			
 
				+    print(extract_unit_money(s, "785.0", [6, 11]))
			
 
				+
			
 
				+
			
 
				+def test_html():
			
 
				+    html_path = "C:/Users/Administrator/Desktop/3.html"
			
 
				+
			
 
				+    with open(html_path, "r") as f:
			
 
				+        s = f.read()
			
 
				+
			
 
				+    print(extract_total_money(s))
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # extract_bidway(s)
			
 
				+
			
 
				+    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
			
 
				+    test_str()
			
 
				+    # test_html(path)
			
 
				+    pass
			
 
				+
			
--- a/BiddingKG/dl/money/test_re_money_total_unit.py
+++ b/BiddingKG/dl/money/test_re_money_total_unit.py
@@ -0,0 +1,75 @@
 
				+import json
			
 
				+import re
			
 
				+import sys, os
			
 
				+import time
			
 
				+
			
 
				+import pandas as pd
			
 
				+from bs4 import BeautifulSoup
			
 
				+sys.path.append(os.path.abspath("../.."))
			
 
				+from BiddingKG.dl.interface.extract import predict
			
 
				+
			
 
				+
			
 
				+def bidi_predict(html_str):
			
 
				+    content = html_str
			
 
				+    # content = "<div>总价：1110</div>"
			
 
				+    result_dict = json.loads(predict("1", content))
			
 
				+    return result_dict
			
 
				+
			
 
				+
			
 
				+def test_csv(_path):
			
 
				+    start_time = time.time()
			
 
				+    df = pd.read_csv(_path)
			
 
				+
			
 
				+    # total money
			
 
				+    predict_list_1 = []
			
 
				+    predict_list_2 = []
			
 
				+    for index, row in df.iterrows():
			
 
				+        # if index >= 1000:
			
 
				+        #     break
			
 
				+
			
 
				+        if index % 50 == 0:
			
 
				+            print("="*30, "Loop", index, "="*30)
			
 
				+
			
 
				+        html_str = row["dochtmlcon"]
			
 
				+        # html_str = df.loc[75, "dochtmlcon"]
			
 
				+        # print(html_str)
			
 
				+
			
 
				+        # 先筛选
			
 
				+        # possible = '((合计.?金额|合.?计|总.?价|单.?价)(（元）)?([:： ]))' \
			
 
				+        #            '|([0-9.，,]+([（(]?元[)）]?)?/)'
			
 
				+        # if not re.search(possible, html_str):
			
 
				+        #     predict_list_1.append(str([]))
			
 
				+        #     predict_list_2.append(str([]))
			
 
				+        #     continue
			
 
				+
			
 
				+        # 先经过模型处理
			
 
				+        result_dict = bidi_predict(html_str)
			
 
				+
			
 
				+        # 获取总价单价
			
 
				+        word_list_1 = result_dict.get("total_money")
			
 
				+        word_list_2 = result_dict.get("unit_money")
			
 
				+
			
 
				+        if word_list_1:
			
 
				+            predict = word_list_1
			
 
				+        else:
			
 
				+            predict = []
			
 
				+        print("predict total money", predict)
			
 
				+        predict_list_1.append(str(predict))
			
 
				+
			
 
				+        if word_list_2:
			
 
				+            predict = word_list_2
			
 
				+        else:
			
 
				+            predict = []
			
 
				+        print("predict unit money", predict)
			
 
				+        predict_list_2.append(str(predict))
			
 
				+
			
 
				+    predict_df_1 = pd.DataFrame(predict_list_1)
			
 
				+    predict_df_2 = pd.DataFrame(predict_list_2)
			
 
				+    df = pd.concat([df, predict_df_1, predict_df_2], axis=1)
			
 
				+    df.to_csv(_path)
			
 
				+    print("finish write!", time.time()-start_time)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
			
 
				+    test_csv(path)
			
--- a/BiddingKG/dl/offer_type/re_offer_type.py
+++ b/BiddingKG/dl/offer_type/re_offer_type.py
@@ -0,0 +1,28 @@
 
				+import pandas as pd
			
 
				+import re
			
 
				+
			
 
				+# 报价类型为总价报价
			
 
				+# 报价类型： 闭口价
			
 
				+# 报价类型：国内含税价/人民币
			
 
				+# 报价类型：国内含税价;人民币
			
 
				+# 报价类型： 浮动价
			
 
				+# 报价类型 含税含运费
			
 
				+# 报价类型 单个商品报价
			
 
				+# 报价类型：单个标的报单价
			
 
				+# 报价类型：多个标的报总价，
			
 
				+# 报价类型：不含税（到厂）
			
 
				+# 报价类型： 金额
			
 
				+# 报价类型 含税含运费
			
 
				+# 报价类型：单个标的报单价
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+# 报价类型：
			
 
				+
			
 
				+
			
--- a/BiddingKG/dl/ratio/re_ratio.py
+++ b/BiddingKG/dl/ratio/re_ratio.py
@@ -0,0 +1,60 @@
 
				+import re
			
 
				+
			
 
				+ratio = '((上浮|下浮)(率|).{0,2}[0-9.]+%)'
			
 
				+
			
 
				+
			
 
				+def re_standard_ratio(_str):
			
 
				+    reg_standard = "(?P<value>" + ratio + ")"
			
 
				+    match = re.finditer(reg_standard, _str)
			
 
				+    ratio_list = []
			
 
				+    if match:
			
 
				+        for m in match:
			
 
				+            m_dict = m.groupdict()
			
 
				+            m_span = m.span()
			
 
				+            keyword_index = [m_span[0], m_span[1]]
			
 
				+            keyword = m_dict.get("value")
			
 
				+            ratio_list.append([keyword, keyword_index])
			
 
				+
			
 
				+    return ratio_list
			
 
				+
			
 
				+
			
 
				+def re_ratio(text):
			
 
				+    # 查找符合标准形式的 总价
			
 
				+    ratio_list = re_standard_ratio(text)
			
 
				+    return ratio_list
			
 
				+
			
 
				+
			
 
				+def extract_ratio(text):
			
 
				+    result_list = []
			
 
				+    total_money_list = re_ratio(text)
			
 
				+    if total_money_list:
			
 
				+        for word, text_index in total_money_list:
			
 
				+            d = {"body": word, "begin_index": text_index[0],
			
 
				+                 "end_index": text_index[1]}
			
 
				+            result_list.append(d)
			
 
				+    return result_list
			
 
				+
			
 
				+
			
 
				+def test_str():
			
 
				+    s = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
			
 
				+    s = '年利率较基准利率的上浮率：30% 活期存款年利率：0.455% 协定存'
			
 
				+    print(extract_ratio(s))
			
 
				+
			
 
				+
			
 
				+def test_html():
			
 
				+    html_path = "C:/Users/Administrator/Desktop/3.html"
			
 
				+
			
 
				+    with open(html_path, "r") as f:
			
 
				+        s = f.read()
			
 
				+
			
 
				+    print(extract_ratio(s))
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # extract_bidway(s)
			
 
				+
			
 
				+    # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
			
 
				+    test_str()
			
 
				+    # test_html(path)
			
 
				+    pass
			
 
				+
			
--- a/BiddingKG/dl/ratio/test_re_ratio.py
+++ b/BiddingKG/dl/ratio/test_re_ratio.py
@@ -0,0 +1,61 @@
 
				+import json
			
 
				+import sys, os
			
 
				+import time
			
 
				+import pandas as pd
			
 
				+sys.path.append(os.path.abspath("../../.."))
			
 
				+print("sys.path[-1]", sys.path[-1])
			
 
				+from BiddingKG.dl.interface.extract import predict
			
 
				+
			
 
				+
			
 
				+def bidi_predict(html_str):
			
 
				+    content = html_str
			
 
				+    result_dict = json.loads(predict("1", content))
			
 
				+    return result_dict
			
 
				+
			
 
				+
			
 
				+def test_csv(_path):
			
 
				+    start_time = time.time()
			
 
				+    df = pd.read_csv(_path)
			
 
				+
			
 
				+    # ratio, total_money, unit_money
			
 
				+    predict_list_1 = []
			
 
				+    predict_list_2 = []
			
 
				+    predict_list_3 = []
			
 
				+    for index, row in df.iterrows():
			
 
				+        if index >= 1000:
			
 
				+            break
			
 
				+
			
 
				+        if index % 50 == 0:
			
 
				+            print("="*30, "Loop", index, time.time()-start_time, "="*30)
			
 
				+
			
 
				+        html_str = row["dochtmlcon"]
			
 
				+
			
 
				+        # 先经过模型处理
			
 
				+        result_dict = bidi_predict(html_str)
			
 
				+
			
 
				+        # 获取比率总价单价
			
 
				+        word_list_1 = result_dict.get("total_money")
			
 
				+        word_list_2 = result_dict.get("unit_money")
			
 
				+        word_list_3 = result_dict.get("ratio")
			
 
				+
			
 
				+        # print("predict ratio", word_list_3)
			
 
				+        predict_list_3.append(str(word_list_3))
			
 
				+
			
 
				+        # print("predict total money", word_list_1)
			
 
				+        predict_list_1.append(str(word_list_1))
			
 
				+
			
 
				+        # print("predict unit money", word_list_2)
			
 
				+        predict_list_2.append(str(word_list_2))
			
 
				+
			
 
				+    predict_df_1 = pd.DataFrame(predict_list_1)
			
 
				+    predict_df_2 = pd.DataFrame(predict_list_2)
			
 
				+    predict_df_3 = pd.DataFrame(predict_list_3)
			
 
				+    df = pd.concat([df, predict_df_3, predict_df_1, predict_df_2], axis=1)
			
 
				+    df.to_csv(_path)
			
 
				+    print("finish write!", time.time()-start_time)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
			
 
				+    path = '比率_result.csv'
			
 
				+    test_csv(path)
			
--- a/BiddingKG/dl/test/test4.py
+++ b/BiddingKG/dl/test/test4.py
@@ -39,7 +39,7 @@ if __name__=="__main__":
 
				     # filename = "比地_52_79929693.html"
			
 
				     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
			
 
				     # text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
			
 
				-    text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
			
 
				+    text = codecs.open("C:\\Users\\Administrator\\Desktop\\2.html", "r", encoding="utf8").read()
			
 
				     content = str(BeautifulSoup(text).find("div",id="pcontent"))
			
 
				     # df_a = {"html":[]}
			
 
				     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
			
@@ -69,6 +69,7 @@ if __name__=="__main__":
 
				     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
			
 
				     # print(predict("12", text))
			
 
				     print(predict("12", content))
			
 
				+    print(predict("12", content))
			
 
				     # test("12",text)
			
 
				     # test("12",content)
			
 
				     print("takes",time.time()-_time1)
			
--- a/BiddingKG/dl/time/re_servicetime.py
+++ b/BiddingKG/dl/time/re_servicetime.py