فهرست منبع

提取总价、单价、比率、是否联合体,判断澄清答疑

fangjiasheng 3 سال پیش
والد
کامیت
67c7f4f0b1

+ 219 - 0
BiddingKG/dl/channel/re_channel_103.py

@@ -0,0 +1,219 @@
+import pandas as pd
+import re
+
+# 各投标人
+# 各潜在投标人
+# 各潜在投标人:
+# 致各招标文件持有者:
+# 致各投标人
+# 各潜在投标供应商:
+
+# 修改、澄清(答疑)纪要内容如下: 1、
+# 答疑澄清与修改的主要内容:
+# 对文件澄清与修改的主要内容
+# 澄清、修改内容要点
+# 答疑纪要
+# 答疑如下
+# 招标文件答疑和招标文件修改通知
+# 招标文件答疑通知
+# 答疑及补遗通知
+# 答疑回复如下:
+# 现对投标人提出的质疑回复如下:
+# 对文件澄清与修改的主要内容 详见招标文件
+# 修改的主要内容 详见附件
+# 澄清或修改事项:
+
+# 第1次答疑
+# 第1次答疑澄清
+
+# 答疑补遗文件
+# 补遗书澄清文件 答疑澄清
+# 质疑1
+# 问题
+# 答疑文件1
+# 具体补遗内容详见附件
+# 请问 答
+# 问题 回复
+# 答疑澄清公告 1:
+# 现对招标文件作如下澄清:
+# 详见答疑澄清文件
+# 详见答疑文件。
+
+
+# Pre-filter used by re_standard_channel_103: the text must contain at least
+# one of these clarification / Q&A / addendum / amendment keywords.
+channel_103 = '(澄清|答疑|补遗|修改)'
+# Tier 0: salutation addressed to the bidders, ending with a colon.
+channel_103_0 = '(致|至|)(各|各个)(潜在|)(投标|招标|招标文件持有|报价|竞选|)(人|者|供应商|单位)(:|:)'
+# Tier 1: keyword + content word(s) + an introducer such as "如下", a colon or "详见".
+channel_103_1 = '(澄清|答疑|补遗|修改|质疑)(.?)(具体内容|主要内容|内容|回复|发布|纪要|事项|如下){1,2}(.?)' \
+                '(如下|[::]|详见|点击下载附件|[1一][::、]|(1)|\\(1\\)|一)'
+# Tier 2: "第N次答疑/澄清"; only kept when channel_103_after follows (see below).
+channel_103_2 = '第(.?)次(答疑|澄清)'
+# Tier 3: keyword + "公告"/"文件"; only kept when channel_103_after follows.
+channel_103_3 = '(澄清|答疑|补遗|修改)(公告|文件)'
+# Follow-up marker searched in the 50 characters after a tier 2 / tier 3 hit.
+channel_103_after = '(请问|提问|问题|答复|回复|质疑|答|问){1,2}[12一]?[::]|[一1][::、]|(1)|\\(1\\)|(详见|见)(附件|答疑文件|澄清文件|答疑澄清文件)'
+# Tier 4: question / answer label, optionally numbered, followed by a colon.
+channel_103_4 = '(补充答疑|提疑内容|请问|提问|问题|回复|答复|答疑|质疑|答|问)[12一]?[::]'
+# Tier 5: pointer to a separate clarification / Q&A document or attachment.
+channel_103_5 = '(见|详见)(答疑澄清文件|澄清文件|答疑文件)|补遗内容详见附件'
+
+# Look-alike phrases that re_not_channel_103 masks before matching. Samples:
+#   "答疑澄清时间"                          (Q&A / clarification time)
+#   "对文件澄清与修改的主要内容 无澄清文件"  (main content ...: no clarification file)
+#   "对文件澄清与修改的主要内容 无"          (main content ...: none)
+#   "请各投标单位自行下载"                  (bidders, please download it yourselves)
+not_channel_103 = '答疑澄清时间|主要内容.?无|请各投标单位'
+
+
+def re_standard_channel_103(_str):
+    """Return the hits of the first pattern tier that matches ``_str``.
+
+    The tiers channel_103_0 .. channel_103_5 are tried in order and the
+    search stops at the first tier producing at least one hit. Tiers 2 and 3
+    only keep a hit when ``channel_103_after`` is found in the 50 characters
+    following it.
+
+    :param _str: text to scan
+    :return: list of ``[keyword, [start, end]]``; empty when nothing matches
+    """
+    # Cheap pre-filter: no clarification keyword at all.
+    if not re.search(channel_103, _str):
+        print("not")
+        return []
+
+    # (debug label, pattern, must a follow-up marker appear after the hit?)
+    tiers = (
+        ("0", channel_103_0, False),
+        ("1", channel_103_1, False),
+        ("2", channel_103_2, True),
+        ("3", channel_103_3, True),
+        ("4", channel_103_4, False),
+        ("5", channel_103_5, False),
+    )
+    for label, pattern, needs_follow_up in tiers:
+        hits = []
+        for found in re.finditer("(?P<value>" + pattern + ")", _str):
+            begin, end = found.span()
+            if needs_follow_up \
+                    and not re.search(channel_103_after, _str[end:end+50]):
+                continue
+            hits.append([found.group("value"), [begin, end]])
+        if hits:
+            print(label, hits)
+            return hits
+
+    return []
+
+
+def re_not_channel_103(_str):
+    """Mask every ``not_channel_103`` match with '#' characters.
+
+    Each match is replaced by a run of '#' of the same length, so the
+    character offsets of the remaining text do not change.
+
+    :param _str: raw text
+    :return: text with the look-alike phrases masked
+    """
+    # Substitute through the pattern itself. Re-using the matched text as a
+    # new regex breaks when the wildcard position captured a metacharacter
+    # such as "(" or "*".
+    return re.sub(not_channel_103, lambda m: "#" * len(m.group(0)), _str)
+
+
+def re_channel_103(text):
+    """Mask confusable phrases in ``text``, then run the tiered matcher.
+
+    :param text: raw text
+    :return: list of ``[keyword, [start, end]]`` from re_standard_channel_103
+    """
+    return re_standard_channel_103(re_not_channel_103(text))
+
+
+def extract_channel_103(text):
+    """Convert the matcher output into result dictionaries.
+
+    :param text: raw text
+    :return: list of ``{"body", "begin_index", "end_index"}`` dicts. The
+        whole result is discarded (``[]``) as soon as one hit is 20 or more
+        characters wide or its span width differs from the keyword length.
+    """
+    records = []
+    for keyword, span in re_channel_103(text):
+        if keyword is None:
+            continue
+        width = span[1] - span[0]
+        if width != len(keyword) or width >= 20:
+            return []
+        records.append({"body": keyword, "begin_index": span[0], "end_index": span[1]})
+    return records
+
+
+def test_csv(_path):
+    """Run re_channel_103 over the "doctextcon" column of a CSV file.
+
+    The stringified predictions are appended as an extra column and the
+    result is written back to ``_path``, overwriting the input file.
+
+    :param _path: path of the CSV file to read and rewrite
+    """
+    df = pd.read_csv(_path)
+
+    predict_list = []
+    for _, row in df.iterrows():
+        # re_channel_103 takes the text only; an extra positional argument
+        # raises TypeError.
+        predict = re_channel_103(row["doctextcon"]) or []
+        print("predict", predict)
+        predict_list.append(str(predict))
+
+    df = pd.concat([df, pd.DataFrame(predict_list)], axis=1)
+
+    df.to_csv(_path)
+    print("finish write!")
+
+
+def test_str():
+    """Print the extraction result for one hard-coded notice snippet."""
+    s = '''
+    (第1次澄清) 发布时间:2020-11-25 致各招标文件持有者: 招标人──舟山市
+    '''
+    print(extract_channel_103(s))
+
+
+def test_html(html_path="C:/Users/Administrator/Desktop/3.html"):
+    """Print the extraction result for the content of an HTML file.
+
+    :param html_path: file to read; defaults to the developer's local sample
+    """
+    with open(html_path, "r") as f:
+        s = f.read()
+
+    # extract_channel_103 accepts the text only; a title keyword raises
+    # TypeError.
+    print(extract_channel_103(s))
+
+
+# Script entry point: only the inline string smoke test is enabled.
+if __name__ == "__main__":
+    # Local result CSV; only needed by the disabled test_csv call below.
+    path = "D:\\BIDI_DOC\\比地_文档\\澄清答疑_result.csv"
+    # test_csv(path)
+    test_str()
+    # test_html(path)
+    pass
+

+ 231 - 0
BiddingKG/dl/if_joint_bidding/re_if_joint_bidding.py

@@ -0,0 +1,231 @@
+import pandas as pd
+import re
+
+# 申请人可以组成联合体报名,联合体的家数最多不能超过两家
+# 本项目不接受供应商以联合体方式进行投标。
+
+# Participation phrase: optional "参与", one participation verb (quote, bid,
+# register, negotiate, ...), optional "登记".
+bidway = '(参与|)(报价|投标|招标|竞价|报名|参加|资格预审|应答|谈判|磋商|竞标)(登记|)'
+
+# 是否接收联合体投标: 不接受
+# 联合体投标: 不允许
+# 是否允许联合体投标登记:是
+# (是/否)接受联合体投标:否
+# 是否接受联合体投标 不接受
+# 是否接受联合体投标:不接受
+# 本项目(是/否)接受联合体投标:否
+# joint_bidding_prefix_1 = '(不[ ]?|[((]{0,1}[ ]?[是否不][ ]?[))]{0,1}|)'
+# Form 1 (question followed by an answer): optional "是否", optional
+# allow/accept verb, the consortium word, a mandatory participation phrase,
+# then 1-2 separator characters and an answer word.
+joint_bidding_prefix_1 = "(是否|)"
+bidway_1 = bidway
+joint_bidding_body_1 = '(允许|接受|接收|)(联合体|独立体或联合体)' + bidway_1
+joint_bidding_suffix_1 = '([ ::。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'
+
+# 不接受(接受或不接受)联合体投标
+# (否)接受联合体。
+# (不)接受联合体投标
+# ( 不 )接受联合体。
+# 本项目 不 允许联合体投标。
+# (否)接受联合体投标
+# 本项目不接受联合体参与投标。
+# 本合同包接受联合体投标
+# 本项目不接受联合体应答,
+# 不接受联合体投标
+# 否 接受联合体
+# 接受 联合体资格预审
+# 接受独立体或联合体报名,联合体的家数最多不能超过两家
+# Form 2 (statement): optional negation / bracketed yes-no marker, a mandatory
+# allow/accept verb, the consortium word, an optional participation phrase,
+# and an optional trailing answer word.
+joint_bidding_prefix_2 = '(不[ ]?|[((]{0,1}[ ]?[是否不][ ]?[))]{0,1}|)'
+bidway_2 = "(" + bidway + "|)"
+joint_bidding_body_2 = '(允许|接受|接收).?(联合体|独立体或联合体)' + bidway_2
+joint_bidding_suffix_2 = '([ ::。]{0,2})(不接受|不接收|不允许|允许|接受|接收|是|否|)'
+# joint_bidding_suffix_2 = ""
+
+# 是否允许联合体 不允许
+# Form 3: mandatory "是否" + allow/accept verb + consortium word (no
+# participation phrase), then 1-2 separators and a mandatory answer word.
+joint_bidding_prefix_3 = '(是否)'
+joint_bidding_body_3 = '(允许|接受|接收).?(联合体|独立体或联合体)'
+joint_bidding_suffix_3 = '([ ::。]{1,2})(不接受|不接收|不允许|允许|接受|接收|是|否)'
+
+
+# 是否接受联合体投标:( )是(√ )否。
+
+
+# 投标人须知前附表规定接受联合体投标的
+# 联合体投标的,
+# 允许联合体投标的
+# 如项目接受联合体投标
+# (是/否)接受联合体投标: 是 否
+# 招标□接受 ?不接受联合体投标
+# 联合体投标:接受;不接受
+# (是/否)
+# 是 否
+# 接受;不接受
+# 接受 ?不接受
+# (接受或不接受)
+# 是否允许联合体: 1 是 0 否
+# 允许联合体报名 □是 ■ 否
+# Phrases that mention a consortium without stating whether one is accepted
+# (conditions, roles, agreements, unfilled option lists). They are deleted
+# from the text by re_not_joint_bidding before matching.
+not_joint_bidding_1 = '(' \
+                      '联合体投标的|如项目接受联合体投标' \
+                      '|是否允许联合体: 1 是 0 否' \
+                      '|联合体参加的|联合体牵头人|联合体牵头方|联合体成员|联合体(牵头人)' \
+                      '|联合体各方|联合体协议' \
+                      '|允许联合体报名 □是 ■ 否' \
+                      ')'
+# Option lists that show both alternatives (yes/no, accept/not accept) and
+# therefore carry no decision. Also deleted by re_not_joint_bidding.
+not_joint_bidding_2 = '(' \
+                      '[((]{0,1}.?是.{1,2}否[))]{0,1}' \
+                      '|[((]{0,1}.?接受.{0,2}不接受[))]{0,1}' \
+                      '|1 是 0 否' \
+                      '|.{1}接受.{1,2}不接受' \
+                      ')'
+
+
+def re_not_joint_bidding(_str):
+    """Delete the confusable phrases from ``_str``.
+
+    not_joint_bidding_1 is removed first, then not_joint_bidding_2.
+
+    :param _str: text to clean
+    :return: cleaned text (shorter than the input whenever something matched)
+    """
+    for noise_pattern in (not_joint_bidding_1, not_joint_bidding_2):
+        _str = re.sub(noise_pattern, "", _str)
+    return _str
+
+
+def re_standard_joint_bidding(_str):
+    """Return the hits of the first consortium-bidding form that matches.
+
+    Forms 1, 2 and 3 (see the joint_bidding_*_N constants) are tried in order
+    and the search stops at the first form that yields a hit.
+
+    :param _str: cleaned text
+    :return: list of ``[keyword, [start, end]]``; empty when nothing matches.
+        Every form uses this shape because judge_joint_bidding and
+        extract_joint_bidding index the span as ``item[1][0]`` /
+        ``item[1][1]``.
+    """
+    # (prefix, body, suffix, skip hits directly preceded by "是"?)
+    forms = (
+        (joint_bidding_prefix_1, joint_bidding_body_1, joint_bidding_suffix_1, False),
+        (joint_bidding_prefix_2, joint_bidding_body_2, joint_bidding_suffix_2, True),
+        (joint_bidding_prefix_3, joint_bidding_body_3, joint_bidding_suffix_3, False),
+    )
+    for prefix, body, suffix, skip_after_shi in forms:
+        reg_standard = "(?P<prefix>" + prefix + ")" \
+                       + "(?P<body>" + body + ")" \
+                       + "(?P<suffix>" + suffix + ")"
+        joint_bidding_list = []
+        for m in re.finditer(reg_standard, _str):
+            begin, end = m.span()
+            # Form 2 only: a hit right after "是" is the tail of a "是否..."
+            # question, not a statement.
+            if skip_after_shi and _str[begin-1:begin] == "是":
+                continue
+            keyword = m.group("prefix") + m.group("body") + m.group("suffix")
+            joint_bidding_list.append([keyword, [begin, end]])
+        if joint_bidding_list:
+            return joint_bidding_list
+
+    return []
+
+
+def re_joint_bidding(text):
+    """Normalise parentheses, drop confusable phrases, then match.
+
+    :param text: raw text
+    :return: hit list produced by re_standard_joint_bidding
+    """
+    # Parenthesis normalisation.
+    # NOTE(review): as rendered here, pattern and replacement are the same
+    # character; presumably one side was a full-width parenthesis. Confirm
+    # against the repository file.
+    text_clean = re.sub("\\)", ")", re.sub("\\(", "(", text))
+
+    return re_standard_joint_bidding(re_not_joint_bidding(text_clean))
+
+
+def judge_joint_bidding(_list):
+    """Prefix each keyword with its decision flag.
+
+    A keyword containing "否" or "不" gets the flag "0", any other keyword
+    gets "1". The flag is joined to the keyword with one space.
+
+    :param _list: items whose element 0 is the keyword and element 1 the span
+    :return: new list of ``[flag + " " + keyword, span]``
+    """
+    labelled = []
+    for item in _list:
+        keyword = item[0]
+        flag = "0" if ("否" in keyword or "不" in keyword) else "1"
+        labelled.append([flag + " " + keyword, item[1]])
+    return labelled
+
+
+def extract_joint_bidding(text):
+    """Extract consortium-bidding statements from ``text``.
+
+    :param text: raw text
+    :return: list of ``{"body", "begin_index", "end_index"}`` dicts, where
+        ``body`` is the flagged keyword built by judge_joint_bidding
+    """
+    labelled = judge_joint_bidding(re_joint_bidding(text))
+    return [
+        {"body": word, "begin_index": span[0], "end_index": span[1]}
+        for word, span in labelled
+        if word is not None
+    ]
+
+
+def test_csv(_path):
+    """Run re_joint_bidding over the "doctextcon" column of a CSV file.
+
+    The stringified predictions are appended as an extra column and the
+    result is written back to ``_path``, overwriting the input file.
+
+    :param _path: path of the CSV file to read and rewrite
+    """
+    df = pd.read_csv(_path)
+
+    predictions = []
+    for _, row in df.iterrows():
+        hits = re_joint_bidding(row["doctextcon"]) or []
+        print("predict", hits)
+        predictions.append(str(hits))
+
+    df = pd.concat([df, pd.DataFrame(predictions)], axis=1)
+
+    df.to_csv(_path)
+    print("finish write!")
+
+
+def test_str():
+    """Print the extraction result for one hard-coded sample sentence."""
+    # Other phrasings worth trying as input:
+    #   (不)接受联合体投标
+    #   本项目不接受供应商以联合体方式进行投标。
+    #   (否)接受联合体。
+    #   是否接收联合体投标: 不接受
+    #   联合体投标: 不允许
+    #   是否允许联合体投标登记:是
+    s = '''
+测绘服务 是否允许联合体 不允许 行业
+    '''
+    print(extract_joint_bidding(s))
+
+
+def test_html(_path):
+    """Print the extraction result for the content of an HTML file.
+
+    :param _path: path of the file to read
+    """
+    with open(_path, "r") as f:
+        s = f.read()
+
+    # extract_joint_bidding accepts the text only; a title keyword raises
+    # TypeError.
+    print(extract_joint_bidding(s))
+
+
+# Script entry point: runs the CSV batch test, which rewrites the file at
+# `path` in place.
+if __name__ == "__main__":
+    path = "D:\\BIDI_DOC\\比地_文档\\投标工期_result.csv"
+    test_csv(path)
+    # test_str()
+    # test_html(path)
+    pass
+

+ 55 - 15
BiddingKG/dl/interface/extract.py

@@ -4,7 +4,6 @@ Created on 2019年1月4日
 @author: User
 '''
 import os
-
 from bs4 import BeautifulSoup, Comment
 import copy
 import re
@@ -24,10 +23,11 @@ import BiddingKG.dl.interface.Preprocessing as Preprocessing
 import BiddingKG.dl.interface.getAttributes as getAttributes
 import BiddingKG.dl.complaint.punish_predictor as punish_rule
 import json
+from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
+from BiddingKG.dl.ratio.re_ratio import extract_ratio
 
 
-
-#自定义jsonEncoder
+# 自定义jsonEncoder
 class MyEncoder(json.JSONEncoder):
     def default(self, obj):
         if isinstance(obj, np.ndarray):
@@ -41,39 +41,40 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
+
 def predict(doc_id,text,title="",page_time="",**kwargs):
     cost_time = dict()
 
     start_time = time.time()
-    log("start process doc %s"%(str(doc_id)))
+    # log("start process doc %s"%(str(doc_id)))
     list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
-    log("get preprocessed done of doc_id%s"%(doc_id))
+    # log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
 
-    #依赖句子顺序
+    # 依赖句子顺序
     start_time = time.time()
     list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
     cost_time["channel"] = round(time.time()-start_time,2)
 
     start_time = time.time()
     codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
-    log("get codename done of doc_id%s"%(doc_id))
+    # log("get codename done of doc_id%s"%(doc_id))
     cost_time["codename"] = round(time.time()-start_time,2)
 
     start_time = time.time()
     predictor.getPredictor("prem").predict(list_sentences,list_entitys)
-    log("get prem done of doc_id%s"%(doc_id))
+    # log("get prem done of doc_id%s"%(doc_id))
     cost_time["prem"] = round(time.time()-start_time,2)
 
     start_time = time.time()
     predictor.getPredictor("product").predict(list_sentences,list_entitys)
-    log("get product done of doc_id%s"%(doc_id))
+    # log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
 
     start_time = time.time()
     product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
-    log("get product attributes done of doc_id%s"%(doc_id))
+    # log("get product attributes done of doc_id%s"%(doc_id))
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
     start_time = time.time()
@@ -82,12 +83,12 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
     start_time = time.time()
     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
-    log("get epc done of doc_id%s"%(doc_id))
+    # log("get epc done of doc_id%s"%(doc_id))
     cost_time["person"] = round(time.time()-start_time,2)
 
     start_time = time.time()
     predictor.getPredictor("time").predict(list_sentences, list_entitys)
-    log("get time done of doc_id%s"%(doc_id))
+    # log("get time done of doc_id%s"%(doc_id))
     cost_time["time"] = round(time.time()-start_time,2)
 
     # 需在getPredictor("prem")后  getAttributes.getPREMs 前
@@ -104,11 +105,46 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
                         _entity.values[1] = 0.51
                         _entity.set_Money(1, _entity.values)
 
-    #依赖句子顺序
+    # 2021-12-08新增:提取:总价,单价,比率
+    total_money_list = []
+    unit_money_list = []
+    ratio_list = []
+    for i in range(len(list_entitys)):
+        list_entity = list_entitys[i]
+
+        # 总价单价
+        for _entity in list_entity:
+            if _entity.entity_type == 'money':
+                word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text
+                # 总价在中投标金额中
+                if _entity.label == 1:
+                    result = extract_total_money(word_of_sentence,
+                                                 _entity.entity_text,
+                                                 [_entity.wordOffset_begin, _entity.wordOffset_end])
+                    if result:
+                        total_money_list.append(result)
+
+                # 单价在普通金额中
+                else:
+                    result = extract_unit_money(word_of_sentence,
+                                                _entity.entity_text,
+                                                [_entity.wordOffset_begin, _entity.wordOffset_end])
+                    if result:
+                        unit_money_list.append(result)
+
+        # 比率
+        all_sentence = ""
+        for sentence in list_sentences[i]:
+            all_sentence += sentence.sentence_text + ","
+        result = extract_ratio(all_sentence)
+        if result:
+            ratio_list.append(result)
+
+    # 依赖句子顺序
     start_time = time.time()
     entityLink.link_entitys(list_entitys)
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
-    log("get attributes done of doc_id%s"%(doc_id))
+    # log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
     start_time = time.time()
@@ -121,13 +157,17 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
                 if product in d['project_name']:
                     d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
 
-    #print(prem)
+    # print(prem)
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
     data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1])
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 
+    data_res["total_money"] = total_money_list
+    data_res["unit_money"] = unit_money_list
+    data_res["ratio"] = ratio_list
+
     # for _article in list_articles:
     #     log(_article.content)
     #

+ 130 - 0
BiddingKG/dl/money/re_money_total_unit.py

@@ -0,0 +1,130 @@
+import json
+import pandas as pd
+import re
+from bs4 import BeautifulSoup
+
+# Total-price keywords ("合计金额", "合计", "总价"), each tolerating one
+# arbitrary character between its parts.
+total_money = '(合计.?金额|合.?计|总.?价)'
+# Unit-price markers: the word "单价", or a number (optionally followed by
+# "元", with or without brackets) immediately before a "/".
+unit_money = '(单价|([0-9.,,]+([((]?元[))]?)?/))'
+
+
+def re_standard_total(_str):
+    """Find every total-price keyword in ``_str``.
+
+    :param _str: text window to scan
+    :return: list of ``[keyword, [start, end], _str]``, one per match
+    """
+    pattern = "(?P<value>" + total_money + ")"
+    return [
+        [hit.group("value"), list(hit.span()), _str]
+        for hit in re.finditer(pattern, _str)
+    ]
+
+
+def re_standard_unit(_str):
+    """Find every unit-price marker in ``_str``.
+
+    :param _str: text window to scan
+    :return: list of ``[keyword, [start, end], _str]``, one per match
+    """
+    pattern = "(?P<value>" + unit_money + ")"
+    return [
+        [hit.group("value"), list(hit.span()), _str]
+        for hit in re.finditer(pattern, _str)
+    ]
+
+
+def re_total(text, money, index):
+    """Search for total-price keywords around an already extracted amount.
+
+    The scanned window runs from 10 characters before ``index[0]`` to 10
+    characters after ``index[1]``, clipped to the bounds of ``text``.
+
+    :param text: sentence containing the amount
+    :param money: amount text (accepted but not used here)
+    :param index: ``[begin, end]`` offsets of the amount inside ``text``
+    :return: hit list from re_standard_total; the spans in it are relative
+        to the window, which is returned as the third element of each hit
+    """
+    window_start = max(index[0] - 10, 0)
+    window_stop = min(index[1] + 10, len(text))
+    return re_standard_total(text[window_start:window_stop])
+
+
+def re_unit(text, money, index):
+    """Search for unit-price markers around an already extracted amount.
+
+    The scanned window runs from 10 characters before ``index[0]`` to 10
+    characters after ``index[1]``, clipped to the bounds of ``text``.
+
+    :param text: sentence containing the amount
+    :param money: amount text (accepted but not used here)
+    :param index: ``[begin, end]`` offsets of the amount inside ``text``
+    :return: hit list from re_standard_unit; the spans in it are relative
+        to the window, which is returned as the third element of each hit
+    """
+    window_start = max(index[0] - 10, 0)
+    window_stop = min(index[1] + 10, len(text))
+    return re_standard_unit(text[window_start:window_stop])
+
+
+def extract_total_money(text, money, index):
+    """Return the total-price keywords found near an amount as dicts.
+
+    :param text: sentence containing the amount
+    :param money: amount text, forwarded to re_total
+    :param index: ``[begin, end]`` offsets of the amount inside ``text``
+    :return: list of ``{"body", "begin_index", "end_index", "context"}``
+    """
+    return [
+        {"body": word, "begin_index": span[0],
+         "end_index": span[1], "context": context}
+        for word, span, context in re_total(text, money, index)
+    ]
+
+
+def extract_unit_money(text, money, index):
+    """Return the unit-price markers found near an amount as dicts.
+
+    :param text: sentence containing the amount
+    :param money: amount text, forwarded to re_unit
+    :param index: ``[begin, end]`` offsets of the amount inside ``text``
+    :return: list of ``{"body", "begin_index", "end_index", "context"}``
+    """
+    return [
+        {"body": word, "begin_index": span[0],
+         "end_index": span[1], "context": context}
+        for word, span, context in re_unit(text, money, index)
+    ]
+
+
+def test_str():
+    """Print the unit-price extraction result for one hard-coded sample."""
+    s = '往往,20(元)/平方'
+    print(extract_unit_money(s, "785.0", [6, 11]))
+
+
+def test_html(html_path="C:/Users/Administrator/Desktop/3.html"):
+    """Print every total-price keyword found in an HTML file.
+
+    :param html_path: file to read; defaults to the developer's local sample
+    """
+    with open(html_path, "r") as f:
+        s = f.read()
+
+    # extract_total_money requires (text, money, index). The span [0, len(s)]
+    # makes the +/-10 character window in re_total cover the whole document.
+    print(extract_total_money(s, "", [0, len(s)]))
+
+
+# Script entry point: only the inline string smoke test is enabled.
+if __name__ == "__main__":
+    # extract_bidway(s)
+
+    # Local result CSV; not used by the enabled call below.
+    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
+    test_str()
+    # test_html(path)
+    pass
+

+ 75 - 0
BiddingKG/dl/money/test_re_money_total_unit.py

@@ -0,0 +1,75 @@
+import json
+import re
+import sys, os
+import time
+
+import pandas as pd
+from bs4 import BeautifulSoup
+sys.path.append(os.path.abspath("../.."))
+from BiddingKG.dl.interface.extract import predict
+
+
+def bidi_predict(html_str):
+    """Run the extraction pipeline on one HTML document.
+
+    :param html_str: document HTML
+    :return: the value of ``predict("1", html_str)`` decoded with json.loads
+    """
+    return json.loads(predict("1", html_str))
+
+
+def test_csv(_path):
+    """Predict total / unit price for every "dochtmlcon" row of a CSV file.
+
+    Two columns holding the stringified "total_money" and "unit_money"
+    results are appended, and the file at ``_path`` is overwritten.
+
+    :param _path: path of the CSV file to read and rewrite
+    """
+    start_time = time.time()
+    df = pd.read_csv(_path)
+
+    total_column = []
+    unit_column = []
+    for index, row in df.iterrows():
+        # Progress marker every 50 rows.
+        if index % 50 == 0:
+            print("="*30, "Loop", index, "="*30)
+
+        # Full pipeline first; the price lists are read from its result.
+        result_dict = bidi_predict(row["dochtmlcon"])
+
+        total_hits = result_dict.get("total_money") or []
+        print("predict total money", total_hits)
+        total_column.append(str(total_hits))
+
+        unit_hits = result_dict.get("unit_money") or []
+        print("predict unit money", unit_hits)
+        unit_column.append(str(unit_hits))
+
+    df = pd.concat([df, pd.DataFrame(total_column), pd.DataFrame(unit_column)], axis=1)
+    df.to_csv(_path)
+    print("finish write!", time.time()-start_time)
+
+
+# Script entry point: batch-predicts over the local CSV and rewrites it.
+if __name__ == "__main__":
+    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
+    test_csv(path)

+ 28 - 0
BiddingKG/dl/offer_type/re_offer_type.py

@@ -0,0 +1,28 @@
+import pandas as pd
+import re
+
+# 报价类型为总价报价
+# 报价类型: 闭口价
+# 报价类型:国内含税价/人民币
+# 报价类型:国内含税价;人民币
+# 报价类型: 浮动价
+# 报价类型 含税含运费
+# 报价类型 单个商品报价
+# 报价类型:单个标的报单价
+# 报价类型:多个标的报总价,
+# 报价类型:不含税(到厂)
+# 报价类型: 金额
+# 报价类型 含税含运费
+# 报价类型:单个标的报单价
+
+
+
+
+
+
+
+
+
+# 报价类型:
+
+

+ 60 - 0
BiddingKG/dl/ratio/re_ratio.py

@@ -0,0 +1,60 @@
+import re
+
+# "上浮" / "下浮" (upward / downward float), optional "率", up to two arbitrary
+# characters, then digits and dots followed by a percent sign.
+ratio = '((上浮|下浮)(率|).{0,2}[0-9.]+%)'
+
+
+def re_standard_ratio(_str):
+    """Find every floating-rate expression in ``_str``.
+
+    :param _str: text to scan
+    :return: list of ``[keyword, [start, end]]``, one per match
+    """
+    pattern = "(?P<value>" + ratio + ")"
+    return [
+        [hit.group("value"), list(hit.span())]
+        for hit in re.finditer(pattern, _str)
+    ]
+
+
+def re_ratio(text):
+    """Return the standard-form ratio matches of ``text``.
+
+    :param text: text to scan
+    :return: list of ``[keyword, [start, end]]`` from re_standard_ratio
+    """
+    return re_standard_ratio(text)
+
+
+def extract_ratio(text):
+    """Extract floating-rate expressions from ``text`` as dicts.
+
+    :param text: text to scan
+    :return: list of ``{"body", "begin_index", "end_index"}``
+    """
+    return [
+        {"body": word, "begin_index": span[0],
+         "end_index": span[1]}
+        for word, span in re_ratio(text)
+    ]
+
+
+def test_str():
+    """Print the ratio extraction result for one hard-coded sample."""
+    s = '年利率较基准利率的上浮率:30% 活期存款年利率:0.455% 协定存'
+    print(extract_ratio(s))
+
+
+def test_html(html_path="C:/Users/Administrator/Desktop/3.html"):
+    """Print the ratio extraction result for the content of an HTML file.
+
+    :param html_path: file to read; defaults to the developer's local sample
+    """
+    with open(html_path, "r") as f:
+        s = f.read()
+
+    print(extract_ratio(s))
+
+
+# Script entry point: only the inline string smoke test is enabled.
+if __name__ == "__main__":
+    # extract_bidway(s)
+
+    # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
+    test_str()
+    # test_html(path)
+    pass
+

+ 61 - 0
BiddingKG/dl/ratio/test_re_ratio.py

@@ -0,0 +1,61 @@
+import json
+import sys, os
+import time
+import pandas as pd
+sys.path.append(os.path.abspath("../../.."))
+print("sys.path[-1]", sys.path[-1])
+from BiddingKG.dl.interface.extract import predict
+
+
+def bidi_predict(html_str):
+    """Run the extraction pipeline on one HTML document.
+
+    :param html_str: document HTML
+    :return: the value of ``predict("1", html_str)`` decoded with json.loads
+    """
+    return json.loads(predict("1", html_str))
+
+
+def test_csv(_path):
+    """Predict ratio / total price / unit price for the first 1000 CSV rows.
+
+    Three columns with the stringified "ratio", "total_money" and
+    "unit_money" results (in that order) are appended, and the file at
+    ``_path`` is overwritten.
+
+    :param _path: path of the CSV file to read and rewrite
+    """
+    start_time = time.time()
+    df = pd.read_csv(_path)
+
+    # Insertion order is the column order of the output.
+    columns = {"ratio": [], "total_money": [], "unit_money": []}
+    for index, row in df.iterrows():
+        if index >= 1000:
+            break
+
+        # Progress marker with elapsed seconds every 50 rows.
+        if index % 50 == 0:
+            print("="*30, "Loop", index, time.time()-start_time, "="*30)
+
+        # Full pipeline first; the three lists are read from its result.
+        result_dict = bidi_predict(row["dochtmlcon"])
+        for key, values in columns.items():
+            values.append(str(result_dict.get(key)))
+
+    frames = [df] + [pd.DataFrame(values) for values in columns.values()]
+    df = pd.concat(frames, axis=1)
+    df.to_csv(_path)
+    print("finish write!", time.time()-start_time)
+
+
+# Script entry point: batch-predicts over a CSV in the working directory and
+# rewrites it.
+if __name__ == "__main__":
+    # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
+    path = '比率_result.csv'
+    test_csv(path)

+ 2 - 1
BiddingKG/dl/test/test4.py

@@ -39,7 +39,7 @@ if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
     # text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
+    text = codecs.open("C:\\Users\\Administrator\\Desktop\\2.html", "r", encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -69,6 +69,7 @@ if __name__=="__main__":
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
     # print(predict("12", text))
     print(predict("12", content))
+    print(predict("12", content))
     # test("12",text)
     # test("12",content)
     print("takes",time.time()-_time1)

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 1 - 7
BiddingKG/dl/time/re_servicetime.py


برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است