Эх сурвалжийг харах

表头文件更新、纵向项目需求预算时间提取、预处理金额正则优化

lishimin 3 жил өмнө
parent
commit
9c8aab67ef

+ 1 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -1713,7 +1713,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号]{,8}?))(第[123一二三]名[::])?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台个只吨斤棵株页亩方条米]*))\s*[)\)]?))",
+                                  "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|标的基本情况|CNY|成交结果:)(?:[,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号]{,8}?))(第[123一二三]名[::])?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台个只吨斤棵株页亩方条米]*))\s*[)\)]?))",
                                   "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?元)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
                                   "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千]*)[\((]?(?P<unit_behind_m>[万亿]?元(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。

+ 13 - 13
BiddingKG/dl/interface/extract.py

@@ -145,19 +145,19 @@ if __name__=="__main__":
     t1 = time.time()
     text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
     title = '合同公告'
-    # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
-    # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
-    # for i in range(10):
-    #     text = df.loc[i, 'dochtmlcon']
-    #     rs = json.loads(predict('', text, ''))
-    #     print(rs['demand_info'])
-    #     print(rs['product'])
-    #     print(rs['product_attrs'])
-    # print(rs)
-
-    with open('D:/138786703.html', 'r', encoding='utf-8') as f:
-        text = f.read()
-        print(predict('', text, title))
+    df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
+    # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
+    for i in range(30,50,1):
+        text = df.loc[i, 'dochtmlcon']
+        rs = json.loads(predict('', text, ''))
+        print(rs['demand_info'])
+        print(rs['product'])
+        print(rs['product_attrs'])
+    print(rs)
+
+    # with open('D:/138786703.html', 'r', encoding='utf-8') as f:
+    #     text = f.read()
+    #     print(predict('', text, title))
 
     # print(predict('',text,title))
     # df = pd.read_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')[:20]

BIN
BiddingKG/dl/interface/header_set.pkl


+ 72 - 33
BiddingKG/dl/interface/predictor.py

@@ -1578,7 +1578,7 @@ class ProductAttributesPredictor():
         for tr in trs:
             tr_line = []
             tds = tr.findChildren(['td', 'th'], recursive=False)
-            if len(tds) < 3:
+            if len(tds) < 2:
                 continue
             for td in tds:
                 td_text = re.sub('\s', '', td.get_text())
@@ -1662,8 +1662,38 @@ class ProductAttributesPredictor():
         except:
             num = 30
         return str(num)
-    def fix_time(self, text):
+    def fix_time(self, text, html, page_time):
         '''输入日期字段返回格式化日期'''
+        if re.search('^\d{1,2}月$', text):
+            m = re.search('^(\d{1,2})月$', text).group(1)
+            if len(m) < 2:
+                m = '0' + m
+            year = re.search('(\d{4})年(.{,12}采购意向)?', html)
+            if year:
+                y = year.group(1)
+                num = self.get_monthlen(y, m)
+                if len(num) < 2:
+                    num = '0' + num
+                order_begin = "%s.%s.01" % (y, m)
+                order_end = "%s.%s.%s" % (y, m, num)
+            elif page_time != "":
+                year = re.search('\d{4}', page_time)
+                if year:
+                    y = year.group(0)
+                    num = self.get_monthlen(y, m)
+                    if len(num) < 2:
+                        num = '0' + num
+                    order_begin = "%s.%s.01" % (y, m)
+                    order_end = "%s.%s.%s" % (y, m, num)
+                else:
+                    y = str(datetime.datetime.now().year)
+                    num = self.get_monthlen(y, m)
+                    if len(num) < 2:
+                        num = '0' + num
+                    order_begin = "%s.%s.01" % (y, m)
+                    order_end = "%s.%s.%s" % (y, m, num)
+            return order_begin, order_end
+
         t1 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})月?$', text)
         if t1:
             year = t1.group(1)
@@ -1797,6 +1827,7 @@ class ProductAttributesPredictor():
 
 
         soup = BeautifulSoup(html, 'lxml')
+        flag_yx = True if re.search('采购意向', html) else False
         tables = soup.find_all(['table'])
         headers = []
         headers_demand = []
@@ -1816,10 +1847,47 @@ class ProductAttributesPredictor():
             i = 0
             found_header = False
             header_colnum = 0
+
+            if flag_yx:
+                col0_l = []
+                col1_l = []
+                for tds in inner_table:
+                    if len(tds) == 2:
+                        col0_l.append(re.sub(':', '', tds[0]))
+                        col1_l.append(tds[1])
+                if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2:
+                    header_list2 = []
+                    product = demand = budget = order_begin = order_end = ""
+                    for i in range(len(col0_l)):
+                        if re.search('项目名称', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            product = col1_l[i]
+                        elif re.search('采购需求|需求概况', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            demand = col1_l[i]
+                        elif re.search('采购预算|预算金额', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            budget = col1_l[i]
+                            if '万元' in col0_l[i] and '万' not in budget:
+                                budget += '万元'
+                            budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
+                            budget = str(getUnifyMoney(budget))
+                        elif re.search('采购时间', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            order_time = col1_l[i].strip()
+                            order_begin, order_end = self.fix_time(order_time, html, page_time)
+                    if product!= "" and demand != "" and budget!="" and order_begin != "":
+                        link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
+                                'order_begin': order_begin, 'order_end': order_end}
+                        if link not in demand_link:
+                            demand_link.append(link)
+                            headers_demand.append('_'.join(header_list2))
+                        continue
+
             while i < (len(inner_table)):
                 tds = inner_table[i]
                 not_empty = [it for it in tds if it != ""]
-                if len(set(not_empty)) < len(not_empty) * 0.5:
+                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2:
                     i += 1
                     continue
                 product = ""  # 产品
@@ -1899,36 +1967,7 @@ class ProductAttributesPredictor():
                         if id8 != "":
                             if re.search('\w', tds[id8]):
                                 order_time = tds[id8].strip()
-                                if re.search('^\d{1,2}月$', order_time):
-                                    m = re.search('^(\d{1,2})月$', order_time).group(1)
-                                    if len(m) < 2:
-                                        m = '0'+m
-                                    year = re.search('(\d{4})年(.{,12}采购意向)?', html)
-                                    if year:
-                                        y = year.group(1)
-                                        num = self.get_monthlen(y, m)
-                                        if len(num)<2:
-                                            num = '0'+num
-                                        order_begin = "%s.%s.01" % (y, m)
-                                        order_end = "%s.%s.%s" % (y, m, num)
-                                    elif page_time!="":
-                                        year = re.search('\d{4}', page_time)
-                                        if year:
-                                            y = year.group(0)
-                                            num = self.get_monthlen(y, m)
-                                            if len(num) < 2:
-                                                num = '0'+num
-                                            order_begin = "%s.%s.01" % (y, m)
-                                            order_end = "%s.%s.%s" % (y, m, num)
-                                        else:
-                                            y = str(datetime.datetime.now().year)
-                                            num = self.get_monthlen(y, m)
-                                            if len(num) < 2:
-                                                num = '0'+num
-                                            order_begin = "%s.%s.01" % (y, m)
-                                            order_end = "%s.%s.%s" % (y, m, num)
-                                else:
-                                    order_begin, order_end = self.fix_time(order_time)
+                                order_begin, order_end = self.fix_time(order_time, html, page_time)
                         if quantity != "" or unitPrice != "" or brand != "" or specs != "":
                             link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
                                                       'brand': brand[:50], 'specs':specs}