Просмотр исходного кода

Merge branch 'master' of http://192.168.2.65:3000/BIDI-ML/BIDI_ML_INFO_EXTRACTION

luojiehua 3 года назад
Родитель
Коммит
eefeb6ecaf

+ 25 - 10
BiddingKG/dl/interface/Preprocessing.py

@@ -1,4 +1,4 @@
-
+# -*- coding: utf-8 -*-
 
 from bs4 import BeautifulSoup, Comment
 import copy
@@ -752,7 +752,10 @@ def tableToText(soup):
 
                                 cell = table_occurence[i][j]
                                 head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
-                                head += cell["left_head"]
+                                if re.search("单报标限总]价|金额|成交报?价|报价", head):
+                                    head = cell["left_head"] + head
+                                else:
+                                    head += cell["left_head"]
                                 if str(head+cell["text"]) in text_set:
                                     continue
                                 if re.search(packPattern,head) is not None:
@@ -787,7 +790,10 @@ def tableToText(soup):
 
                                 cell = table_occurence[i][j]
                                 head = (cell["left_head"]+"") if len(cell["left_head"])>0 else ""
-                                head += cell["top_head"]
+                                if re.search("单报标限总]价|金额|成交报?价|报价", head):
+                                    head = cell["top_head"] + head
+                                else:
+                                    head += cell["top_head"]
                                 if str(head+cell["text"]) in text_set:
                                     continue
                                 if re.search(packPattern,head) is not None:
@@ -796,7 +802,8 @@ def tableToText(soup):
                                     #排名替换为同一种表达
                                     rank_text += head+cell["text"]+","
                                     #print(rank_text)
-                                elif re.search(entityPattern,head) is not None:
+                                elif re.search(entityPattern,head) is not None and \
+                                        re.search('业绩|资格|条件',head)==None and re.search('业绩',cell["text"])==None : #2021/10/19 解决包含业绩的行调到前面问题
                                     entity_text += head+cell["text"]+","
                                     #print(entity_text)
                                 else:
@@ -986,20 +993,22 @@ def tableToText(soup):
     pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
 
     list_innerTable = []
-    tbodies = soup.find_all('table')
+    tbodies = soup.find_all('tbody')
     # 遍历表格中的每个tbody
     #逆序处理嵌套表格
     for tbody_index in range(1,len(tbodies)+1):
         tbody = tbodies[len(tbodies)-tbody_index]
         inner_table = trunTable(tbody)
         list_innerTable.append(inner_table)
-    tbodies = soup.find_all('tbody')
+    '''2021/10/19先找tbody 再找table,避免一个table内多个tbody造成数据丢失'''
+    tbodies = soup.find_all('table')
     # 遍历表格中的每个tbody
     #逆序处理嵌套表格
     for tbody_index in range(1,len(tbodies)+1):
         tbody = tbodies[len(tbodies)-tbody_index]
         inner_table = trunTable(tbody)
         list_innerTable.append(inner_table)
+
     return soup
     # return list_innerTable
 
@@ -1825,7 +1834,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                                 re.search('\d{5,}',entity_text) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}',entity_text)==None:
                             unit = '元'
                             # print('明显金额特征补充单位 元')
-                        elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7})$)|(^\d{,3}(,\d{3})+$)',entity_text):
+                        elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)',entity_text):
                             unit = '元'
                             # print('明显金额特征补充单位 元')
                     if unit.find("万") >= 0 and entity_text.find("万") >= 0:  #2021/7/19修改为金额文本有万,不计算单位
@@ -2107,6 +2116,12 @@ if __name__=="__main__":
     '''        
     # content = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
     # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
-    getPredictTable()
-    
-        
+    # getPredictTable()
+    with open('D:/138786703.html', 'r', encoding='utf-8') as f:
+        sourceContent = f.read()
+        # article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
+        # print(article_processed)
+
+        list_articles, list_sentences, list_entitys, _cost_time = get_preprocessed([['doc_id', sourceContent, "", "", '', '2021-02-01']], useselffool=True)
+        for entity in list_entitys[0]:
+            print(entity.entity_type, entity.entity_text)

+ 23 - 3
BiddingKG/dl/interface/extract.py

@@ -68,7 +68,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     cost_time["product"] = round(time.time()-start_time,2)
 
     start_time = time.time()
-    product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text)
+    product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
     log("get product attributes done of doc_id%s"%(doc_id))
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
@@ -102,10 +102,16 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = round(time.time()-start_time,2)
 
+    if len(product_attrs[1]['demand_info']['data'])>0:
+        for d in product_attrs[1]['demand_info']['data']:
+            for product in set(prem[0]['product']):
+                if product in d['project_name']:
+                    d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
+
     #print(prem)
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0])
+    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1])
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 
@@ -139,7 +145,21 @@ if __name__=="__main__":
     t1 = time.time()
     text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
     title = '合同公告'
-    print(predict('',text,title))
+    # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
+    # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
+    # for i in range(10):
+    #     text = df.loc[i, 'dochtmlcon']
+    #     rs = json.loads(predict('', text, ''))
+    #     print(rs['demand_info'])
+    #     print(rs['product'])
+    #     print(rs['product_attrs'])
+    # print(rs)
+
+    with open('D:/138786703.html', 'r', encoding='utf-8') as f:
+        text = f.read()
+        print(predict('', text, title))
+
+    # print(predict('',text,title))
     # df = pd.read_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')[:20]
     # new_prem = []
     # for i in range(len(df)):

+ 172 - 14
BiddingKG/dl/interface/predictor.py

@@ -22,6 +22,8 @@ from BiddingKG.dl.interface.Entitys import Entity
 from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
 from bs4 import BeautifulSoup
 import copy
+import calendar
+import datetime
 
 from threading import RLock
 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
@@ -1532,7 +1534,7 @@ class ProductPredictor():
                     result.append(item) # 修正bug
                 return result
 
-# 产品数量单价品牌规格提取
+# 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
 class ProductAttributesPredictor():
     def __init__(self,):
         self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
@@ -1653,6 +1655,67 @@ class ProductAttributesPredictor():
                             elif len(tds1) > 0 and len(tds1) == indtd - 1:
                                 tds1[indtd - 2].insert_after(copy.copy(td))
 
+    def get_monthlen(self, year, month):
+        '''Return the number of days in the given month, as a string.
+
+        :param year: year, as an int or a numeric string
+        :param month: month (1-12), as an int or a numeric string
+        :return: day count as a string; falls back to '30' when the inputs
+                 cannot be converted to int or the month is out of range
+        '''
+        try:
+            # monthrange returns (weekday of the 1st, number of days).
+            _, num = calendar.monthrange(int(year), int(month))
+        except (TypeError, ValueError):
+            # int() failed, or the month is invalid (calendar raises
+            # IllegalMonthError, a ValueError subclass). Keep the original
+            # best-effort default instead of propagating.
+            num = 30
+        return str(num)
+
+    def fix_time(self, text):
+        '''Normalise a date or date-range string to a (begin, end) pair.
+
+        Three shapes are recognised, each anchored to the whole string:
+          * year + month         e.g. 2021年5月, 2021-05, 2021.5
+            -> first and last day of that month
+          * year + month + day   e.g. 2021年5月3日, 2021/5/3, 2021.5.3
+            -> that day for both begin and end
+          * a range joined by 到 / 至 / -   e.g. 2021年3月至5月,
+            2021年11月5日-2021年12月6日 (a day inside a range must be
+            followed by 日; a missing end year defaults to the start year,
+            a missing start day to 1, a missing end day to the month length)
+
+        :param text: raw date text
+        :return: (order_begin, order_end), both 'YYYY-MM-DD' with zero-padded
+                 month and day; ('', '') when the text is not recognised
+        '''
+        # Explicit separator classes. A bare "." in an alternation matches
+        # ANY character, which let strings such as "202111" be read as
+        # year 2021 / month 1.
+        ym_sep = r'[年/.\-]'
+        md_sep = r'[月/.\-]'
+
+        def pad(value):
+            # Two-digit, zero-padded month/day.
+            return value.zfill(2)
+
+        # Year + month only: expand to the whole month.
+        month_only = re.search(r'^(\d{4})%s(\d{1,2})月?$' % ym_sep, text)
+        if month_only:
+            year, month = month_only.group(1), month_only.group(2)
+            last_day = self.get_monthlen(year, month)
+            order_begin = "%s-%s-01" % (year, pad(month))
+            order_end = "%s-%s-%s" % (year, pad(month), pad(last_day))
+            return order_begin, order_end
+
+        # Single full date: begin and end are the same day.
+        full_date = re.search(
+            r'^(\d{4})%s(\d{1,2})%s(\d{1,2})日?$' % (ym_sep, md_sep), text)
+        if full_date:
+            year, month, day = full_date.groups()
+            order_begin = order_end = "%s-%s-%s" % (year, pad(month), pad(day))
+            return order_begin, order_end
+
+        # Range: "<start> 到|至|- <end>".
+        range_pattern = (
+            r'^(?P<y1>\d{4})%(ym)s(?P<m1>\d{1,2})'
+            r'(?:%(md)s(?:(?P<d1>\d{1,2})日)?)?'
+            r'(?:到|至|-)'
+            r'(?:(?P<y2>\d{4})%(ym)s)?(?P<m2>\d{1,2})'
+            r'(?:%(md)s(?:(?P<d2>\d{1,2})日)?)?$'
+            % {'ym': ym_sep, 'md': md_sep})
+        span = re.search(range_pattern, text)
+        if not span:
+            return "", ""
+
+        y1 = span.group('y1')
+        m1 = span.group('m1')
+        m2 = span.group('m2')
+        y2 = span.group('y2') or y1
+        d1 = span.group('d1') or '1'
+        d2 = span.group('d2') or self.get_monthlen(y2, m2)
+
+        # Build the padded strings directly; rebinding a loop variable (as
+        # the previous padding loop did) leaves the originals untouched.
+        order_begin = "%s-%s-%s" % (y1, pad(m1), pad(d1))
+        order_end = "%s-%s-%s" % (y2, pad(m2), pad(d2))
+        return order_begin, order_end
+
     def find_header(self, items, p1, p2):
         '''
         inner_table 每行正则检查是否为表头,是则返回表头所在列序号,及表头内容
@@ -1662,12 +1725,16 @@ class ProductAttributesPredictor():
         :return: 表头所在列序号,是否表头,表头内容
         '''
         flag = False
-        header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': ''}
+        header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
         product = ""  # 产品
         quantity = ""  # 数量
         unitPrice = ""  # 单价
         brand = ""  # 品牌
         specs = ""  # 规格
+        demand = "" # 采购需求
+        budget = "" # 预算金额
+        order_time = "" # 采购时间
+
         for i in range(min(4, len(items))):
             it = items[i]
             if len(it) < 15 and re.search(p1, it) != None:
@@ -1700,15 +1767,28 @@ class ProductAttributesPredictor():
                 elif re.search('规格', items[j]):
                     header_dic['规格'] = j
                     specs = items[j]
-            if header_dic.get('名称', "") != "" and (header_dic.get('数量', "") != "" or header_dic.get('单价', "") != ""
-                                                   or header_dic.get('品牌', "") != "" or header_dic.get('规格',
-                                                                                                       "") != ""):
-                return header_dic, flag, (product, quantity, unitPrice, brand, specs)
 
+                elif re.search('需求', items[j]):
+                    header_dic['需求'] = j
+                    demand = items[j]
+                elif re.search('预算', items[j]):
+                    header_dic['预算'] = j
+                    budget = items[j]
+                elif re.search('时间', items[j]):
+                    header_dic['时间'] = j
+                    order_time = items[j]
+
+            if header_dic.get('名称', "") != "" :
+                num = 0
+                for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
+                    if it != "":
+                        num  += 1
+                if num >=2:
+                    return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
         flag = False
-        return header_dic, flag, (product, quantity, unitPrice, brand, specs)
+        return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
 
-    def predict(self, docid='', html=''):
+    def predict(self, docid='', html='', page_time=""):
         '''
         正则寻找table表格内 产品相关信息
         :param html:公告HTML原文
@@ -1719,9 +1799,16 @@ class ProductAttributesPredictor():
         soup = BeautifulSoup(html, 'lxml')
         tables = soup.find_all(['table'])
         headers = []
+        headers_demand = []
         header_col = []
         product_link = []
-        for table in tables:
+        demand_link = []
+        for i in range(len(tables)-1, -1, -1):
+            table = tables[i]
+            if table.parent.name == 'td' and len(table.find_all('td')) == 1:
+                table.string = table.get_text()
+                table.name = 'turntable'
+                continue
             if not self.isTrueTable(table):
                 continue
             self.fixSpan(table)
@@ -1740,10 +1827,17 @@ class ProductAttributesPredictor():
                 unitPrice = ""  # 单价
                 brand = ""  # 品牌
                 specs = ""  # 规格
+                demand = ""  # 采购需求
+                budget = ""  # 预算金额
+                order_time = ""  # 采购时间
+                order_begin = ""
+                order_end = ""
+
                 if len(set(tds) & self.header_set) > len(tds) * 0.2:
-                    header_dic, found_header, header_list = self.find_header(tds, self.p1, self.p2)
+                    header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
                     if found_header:
                         headers.append('_'.join(header_list))
+                        headers_demand.append('_'.join(header_list2))
                         header_colnum = len(tds)
                         header_col.append('_'.join(tds))
                     i += 1
@@ -1757,6 +1851,10 @@ class ProductAttributesPredictor():
                     id3 = header_dic.get('单价', "")
                     id4 = header_dic.get('品牌', "")
                     id5 = header_dic.get('规格', "")
+
+                    id6 = header_dic.get('需求', "")
+                    id7 = header_dic.get('预算', "")
+                    id8 = header_dic.get('时间', "")
                     if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                             re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
                         product = tds[id1]
@@ -1768,8 +1866,10 @@ class ProductAttributesPredictor():
                         if id3 != "":
                             if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                                 unitPrice = tds[id3]
-                                if '万元' in header_list[2] and '万' not in unitPrice:
+                                if '万元' in header_list[2] and '万' not in unitPrice:
                                     unitPrice += '万元'
+                                unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
+                                unitPrice = str(getUnifyMoney(unitPrice))
                             else:
                                 unitPrice = ""
                         if id4 != "":
@@ -1782,16 +1882,74 @@ class ProductAttributesPredictor():
                                 specs = tds[id5]
                             else:
                                 specs = ""
+                        if id6 != "":
+                            if re.search('\w', tds[id6]):
+                                demand = tds[id6]
+                            else:
+                                demand = ""
+                        if id7 != "":
+                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
+                                budget = tds[id7]
+                                if '万元' in header_list2[2] and '万' not in budget:
+                                    budget += '万元'
+                                budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
+                                budget = str(getUnifyMoney(budget))
+                            else:
+                                budget = ""
+                        if id8 != "":
+                            if re.search('\w', tds[id8]):
+                                order_time = tds[id8].strip()
+                                if re.search('^\d{1,2}月$', order_time):
+                                    m = re.search('^(\d{1,2})月$', order_time).group(1)
+                                    if len(m) < 2:
+                                        m = '0'+m
+                                    year = re.search('(\d{4})年(.{,12}采购意向)?', html)
+                                    if year:
+                                        y = year.group(1)
+                                        num = self.get_monthlen(y, m)
+                                        if len(num)<2:
+                                            num = '0'+num
+                                        order_begin = "%s.%s.01" % (y, m)
+                                        order_end = "%s.%s.%s" % (y, m, num)
+                                    elif page_time!="":
+                                        year = re.search('\d{4}', page_time)
+                                        if year:
+                                            y = year.group(0)
+                                            num = self.get_monthlen(y, m)
+                                            if len(num) < 2:
+                                                num = '0'+num
+                                            order_begin = "%s.%s.01" % (y, m)
+                                            order_end = "%s.%s.%s" % (y, m, num)
+                                        else:
+                                            y = str(datetime.datetime.now().year)
+                                            num = self.get_monthlen(y, m)
+                                            if len(num) < 2:
+                                                num = '0'+num
+                                            order_begin = "%s.%s.01" % (y, m)
+                                            order_end = "%s.%s.%s" % (y, m, num)
+                                else:
+                                    order_begin, order_end = self.fix_time(order_time)
                         if quantity != "" or unitPrice != "" or brand != "" or specs != "":
-                            # link = "{0}\t{1}\t{2}\t{3}\t{4}".format(product, quantity, unitPrice, brand, specs)
                             link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
-                                                      'brand': brand[:50], 'speces': specs[:100]}
+                                                      'brand': brand[:50], 'specs':specs}
                             if link not in product_link:
                                 product_link.append(link)
+                        if budget != "" and order_time != "" :
+                            link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
+                            if link not in demand_link:
+                                demand_link.append(link)
                     i += 1
                 else:
                     i += 1
-        return [{'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}]
+        if len(product_link)>0:
+            attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
+        else:
+            attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
+        if len(demand_link)>0:
+            demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
+        else:
+            demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
+        return [attr_dic, demand_dic]
 
 # docchannel类型提取
 class DocChannel():