Pārlūkot izejas kodu

Merge remote-tracking branch 'origin/master'

lsm 2 gadi atpakaļ
vecāks
revīzija
02c0295d7b

+ 8 - 4
BiddingKG/dl/interface/Preprocessing.py

@@ -1048,8 +1048,12 @@ def tableToText(soup):
                 _td_len_list.append(len_td)
             if _td_len_list:
                 if len(list(set(_td_len_list))) >= 8 or max(_td_len_list) > 100:
+                    string_list = [re.sub("\s+","",i)for i in tbody.strings if i and i!='\n']
+                    tbody.string = ",".join(string_list)
+                    table_max_len = 30000
+                    tbody.string = tbody.string[:table_max_len]
+                    tbody.name = "turntable"
                     return None
-
         # fixSpan(tbody)
         # inner_table = getTable(tbody)
         # inner_table = fixTable(inner_table)
@@ -1059,7 +1063,8 @@ def tableToText(soup):
         inner_table = fixTable(inner_table)
 
         if inner_table == []:
-            tbody.string = segment(tbody,final=False)
+            string_list = [re.sub("\s+", "", i) for i in tbody.strings if i and i != '\n']
+            tbody.string = ",".join(string_list)
             table_max_len = 30000
             tbody.string = tbody.string[:table_max_len]
             # log('异常表格直接取全文')
@@ -1119,7 +1124,7 @@ def tableToText(soup):
             tag.extract()
     for ul in soup.find_all('ul'): #例子 156439663 多个不同channel 类别的标题
         if ul.find_all('li') == ul.findChildren(recursive=False) and len(set(re.findall(
-                '招标公告|中标结果公示|中标候选人公示|招标答疑|开标评标|合同履?约?公示|开标评标|资格评审',
+                '招标公告|中标结果公示|中标候选人公示|招标答疑|开标评标|合同履?约?公示|资格评审',
                 ul.get_text(), re.S)))>3:
             ul.extract()
 
@@ -1307,7 +1312,6 @@ def segment(soup,final=True):
     commaList = ["div","br","td","p","li"]
     #commaList = []
     spaceList = ["span"]
-
     tbodies = soup.find_all('tbody')
     if len(tbodies) == 0:
         tbodies = soup.find_all('table')

+ 8 - 2
BiddingKG/dl/interface/getAttributes.py

@@ -2817,7 +2817,13 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     for _index in range(len(PackageList)):
         if "hit" in PackageList[_index]:
             for _hit in list(PackageList[_index]["hit"]):
-                _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
+                if len(_hit.split("-"))==3:
+                    _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
+                # 补充金额前新增负号‘-’导致错误的规则
+                elif len(_hit.split("-"))==4:
+                    _money = float(_hit.split("-")[2]) if _hit.split("-")[0] == "money" else None
+                else:
+                    _money = None
                 if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
                     dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
     #只找到一个中标人和中标金额
@@ -2827,7 +2833,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
         # print('一个中标人一个金额:', list(set_tenderer_money)[0])
     #找到一个中标人和多个招标金额
     if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
-        _maxMoney = 0
+        _maxMoney = list(set_tenderer_money)[0]
         _sumMoney = 0
         for _m in list(set_tenderer_money):
             _sumMoney += _m

+ 5 - 1
BiddingKG/dl/ratio/re_ratio.py

@@ -49,7 +49,11 @@ def extract_ratio(text):
     # print(total_money_list)
     if total_money_list:
         for word, text_index in total_money_list:
-            num_value = re.search("\d+(?:\.\d+)?[((]?[%‰]?|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十][零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]*(?:点[零壹贰叁肆伍陆柒捌玖一二三四五六七八九]+)?(?!分之)", word).group()
+            num_value = re.search("\d+(?:\.\d+)?[((]?[%‰]?|[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十][零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]*(?:点[零壹贰叁肆伍陆柒捌玖一二三四五六七八九]+)?(?!分之)", word)
+            if num_value:
+                num_value = num_value.group()
+            else:
+                continue
             if re.search("[零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]",num_value):
                 if '点' in num_value:
                     num_split = num_value.split("点")