Sfoglia il codice sorgente

产品配置参数提取等代码,提取率50%

luojiehua 1 anno fa
parent
commit
c33e152822

+ 1 - 0
.gitignore

@@ -5,3 +5,4 @@
 /attachmentProcessTime2.xlsx
 /BaseDataMaintenance/maintenance/attachment/2022-01-18_183521_export11.xlsx
 /BaseDataMaintenance/test/
+/BaseDataMaintenance/maintenance/product/download/

+ 42 - 28
BaseDataMaintenance/common/Utils.py

@@ -704,9 +704,10 @@ def getMultipleFactor(unit):
     '''
     @summary:拿到单位对应的值
     '''
-    MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
+    MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
     return MultipleFactor.get(unit)
 
+
 def getUnifyMoney(money):
     '''
     @summary:将中文金额字符串转换为数字金额
@@ -715,41 +716,54 @@ def getUnifyMoney(money):
     @return: decimal,数据金额
     '''
 
-
+    MAX_MONEY = 1000000000000
     MAX_NUM = 12
     #去掉逗号
     money = re.sub("[,,]","",money)
-    money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億〇一二三四五六七八九十百千万亿元角分]","",money)
+    money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]","",money)
     result = Decimal(0)
-    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
-    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","元","角","分"]
+    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖","一","二","三","四","五","六","七","八","九"]
+    # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
+    chnFactorUnits = ["圆", "元","兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
 
     LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
     BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
-    if re.search(LowMoneypattern,money) is not None:
-        return Decimal(money)
-    elif re.search(BigMoneypattern,money) is not None:
-        return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
-    for factorUnit in chnFactorUnits:
-        if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
-            subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
-            if re.search(re.compile("^(\d+(,)?)+(\.\d+)?$"),subMoneys[0]) is not None:
-                result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
-            elif len(subMoneys[0])==1:
-                if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
-                    result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
-            else:
-                result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
-
-            if len(subMoneys)>1:
-                if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
-                    result += Decimal(subMoneys[1])
-                elif len(subMoneys[1])==1:
-                    if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
-                        result += Decimal(getDigitsDic(subMoneys[1]))
+    try:
+        if re.search(LowMoneypattern,money) is not None:
+            return Decimal(money)
+        elif re.search(BigMoneypattern,money) is not None:
+            return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
+        for factorUnit in chnFactorUnits:
+            if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
+                subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
+                if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None:
+                    if MAX_MONEY/getMultipleFactor(factorUnit)<Decimal(subMoneys[0]):
+                        return Decimal(0)
+                    result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
+                elif len(subMoneys[0])==1:
+                    if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
+                        result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
+                # subMoneys[0]中无金额单位,不可再拆分
+                elif subMoneys[0]=="":
+                    result += 0
+                elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
+                    # print(subMoneys)
+                    # subMoneys[0] = subMoneys[0][0]
+                    result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
                 else:
-                    result += Decimal(getUnifyMoney(subMoneys[1]))
-            break
+                    result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
+                if len(subMoneys)>1:
+                    if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
+                        result += Decimal(subMoneys[1])
+                    elif len(subMoneys[1])==1:
+                        if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
+                            result += Decimal(getDigitsDic(subMoneys[1]))
+                    else:
+                        result += Decimal(getUnifyMoney(subMoneys[1]))
+                break
+    except Exception as e:
+        # traceback.print_exc()
+        return Decimal(0)
     return result
 
 

+ 11 - 11
BaseDataMaintenance/maintenance/product/1.py

@@ -2,14 +2,14 @@
 
 import re
 pattern="(^|★|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册部\.::]))|" \
-        "([\s★\*]*)(?P<title_3>(?P<title_3_index_0_0>.{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
-        "([\s★\*]*)(?P<title_4>(?P<title_4_index_0_0>.{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
-        "([\s★\*]*)(?P<title_11>(?P<title_11_index_0_0>.{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_10>(?P<title_10_index_0_0>.{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_7>(?P<title_7_index_0_0>.{,3}?\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_6>(?P<title_6_index_0_0>.{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-]?))|" \
-        "([\s★\*]*)(?P<title_15>(?P<title_15_index_0_0>.{,3}?(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
-        "([\s★\*]*)(?P<title_17>(?P<title_17_index_0_0>.{,3}?(?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>)))|" \
-        "([\s★\*]*)(?P<title_19>(?P<title_19_index_0_0>.{,3}?(?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))" \
-""
-print(re.search(pattern,"9.球囊按压"))
+        "([\s★\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
+        "([\s★\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
+        "([\s★\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
+        "([\s★\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
+        "([\s★\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]?))|" \
+        "([\s★\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-]?))|" \
+        "([\s★\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
+        "([\s★\*]*)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?(?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>)))|" \
+        "([\s★\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[)]))" \
+        ""
+print(re.search(pattern,"(一)4K内窥镜荧光摄像系统主机").groupdict())

+ 108 - 30
BaseDataMaintenance/maintenance/product/htmlparser.py

@@ -6,6 +6,7 @@ import logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 
+
 from bs4 import BeautifulSoup
 import copy
 
@@ -144,7 +145,7 @@ class ParseDocument():
             if _id in self.set_tree_id:
                 continue
             self.set_tree_id.add(_id)
-            print(append,t["text"][:20])
+            print(append,t["text"][:50],t["sentence_title"])
             childs = t["child_title"]
             self.print_tree(childs,append=append+"  ")
 
@@ -154,15 +155,16 @@ class ParseDocument():
         return False
 
     def find_title_by_pattern(self,_text,_pattern="(^|★|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册部\.::]))|" \
-                                             "([\s★\*]*)(?P<title_3>(?P<title_3_index_0_0>.{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
-                                             "([\s★\*]*)(?P<title_4>(?P<title_4_index_0_0>.{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
-                                             "([\s★\*]*)(?P<title_11>(?P<title_11_index_0_0>.{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
-                                             "([\s★\*]*)(?P<title_10>(?P<title_10_index_0_0>.{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
-                                             "([\s★\*]*)(?P<title_7>(?P<title_7_index_0_0>.{,3}?\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]?))|" \
-                                             "([\s★\*]*)(?P<title_6>(?P<title_6_index_0_0>.{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-]?))|" \
-                                             "([\s★\*]*)(?P<title_15>(?P<title_15_index_0_0>.{,3}?(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
-                                             "([\s★\*]*)(?P<title_17>(?P<title_17_index_0_0>.{,3}?(?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>)))|" \
-                                             "([\s★\*]*)(?P<title_19>(?P<title_19_index_0_0>.{,3}?(?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))" \
+                                             "([\s★\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>))|" \
+                                             "([\s★\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]))|" \
+                                             "([\s★\*]*)(?P<title_12>(?P<title_12_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_12_index_1_1>\d{1,2})(?P<title_12_index_2_0>[\..、\s\-]?))|"\
+                                             "([\s★\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
+                                             "([\s★\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
+                                             "([\s★\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]?))|" \
+                                             "([\s★\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-]?))|" \
+                                             "([\s★\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))]))|" \
+                                             "([\s★\*]*)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_17_index_1_1>[a-wA-W]+)(?P<title_17_index_2_0>[))]))|" \
+                                             "([\s★\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))" \
                               ):
         _se = re.search(_pattern,_text)
         groups = []
@@ -172,7 +174,7 @@ class ParseDocument():
                 if v is not None:
                     groups.append((k,v))
         if len(groups):
-            groups.sort(key=lambda x:x[0])
+            # groups.sort(key=lambda x:x[0])
             return groups
         return None
 
@@ -190,10 +192,30 @@ class ParseDocument():
             _add = 0
         return next_chr+self.make_increase(_sort,_title[:-1],_add)
 
+
     def get_next_title(self,_title):
         if re.search("^\d+$",_title) is not None:
             return str(int(_title)+1)
         if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
+            if _title[-1]=="十":
+                return _title+"一"
+            if _title[-1]=="百":
+                return _title+"零一"
+
+            if _title[-1]=="九":
+                if len(_title)==1:
+                    return "十"
+                if len(_title)==2:
+                    if _title[0]=="十":
+                        return "二十"
+                if len(_title)==3:
+                    if _title[0]=="九":
+                        return "一百"
+                    else:
+                        _next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title[0]))
+                        print("=_next_title",_next_title)
+                        return _next_title+"十"
+
             _next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
             _next_title = list(_next_title)
             _next_title.reverse()
@@ -225,6 +247,34 @@ class ParseDocument():
                 return _sort[_index+1]
             return None
 
+    def count_title_before(self,list_obj):
+        dict_before = {}
+        for obj_i in range(len(list_obj)):
+            obj = list_obj[obj_i]
+            _type = "sentence"
+            _text = obj.text
+            if obj.name=="table":
+                _type = "table"
+                _text = str(obj)
+            _append = False
+
+
+            if _type=="sentence":
+                sentence_groups = self.find_title_by_pattern(_text[:10])
+                if sentence_groups:
+                    # c062f53cf83401e671822003d63c1828print("sentence_groups",sentence_groups)
+                    sentence_title = sentence_groups[0][0]
+                    sentence_title_text = sentence_groups[0][1]
+                    title_index = sentence_groups[-2][1]
+                    title_before = sentence_groups[1][1].replace("(","(").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".")
+                    title_after = sentence_groups[-1][1].replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".")
+                    next_index = self.get_next_title(title_index)
+                    if title_before not in dict_before:
+                        dict_before[title_before] = 0
+                    dict_before[title_before] += 1
+        return dict_before
+
+
     def buildParsetree(self,list_obj,auto_merge_table=True):
 
         self.parseTree = None
@@ -245,6 +295,7 @@ class ParseDocument():
         last_table_index = None
         last_table_columns = None
         last_table = None
+        dict_before = self.count_title_before(list_obj)
         for obj_i in range(len(list_obj)):
             obj = list_obj[obj_i]
             _type = "sentence"
@@ -272,12 +323,16 @@ class ParseDocument():
                 sentence_groups = self.find_title_by_pattern(_text[:10])
                 if sentence_groups:
                     # c062f53cf83401e671822003d63c1828print("sentence_groups",sentence_groups)
-                    sentence_title = sentence_groups[0][0]
-                    sentence_title_text = sentence_groups[0][1]
-                    title_index = sentence_groups[-2][1]
-                    title_before = sentence_groups[1][1]
-                    title_after = sentence_groups[-1][1]
-                    next_index = self.get_next_title(title_index)
+                    title_before = sentence_groups[1][1].replace("(","(").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")
+                    if title_before in dict_before and dict_before[title_before]>1:
+                        sentence_title = sentence_groups[0][0]
+                        sentence_title_text = sentence_groups[0][1]
+                        title_index = sentence_groups[-2][1]
+
+                        title_after = sentence_groups[-1][1].replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")
+                        next_index = self.get_next_title(title_index)
+                    else:
+                        title_before = None
 
             if _type=="sentence":
                 if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
@@ -298,7 +353,7 @@ class ParseDocument():
                     table_columns = len(list_table[0])
 
                     if auto_merge_table:
-                        if last_table_index is not None and abs(obj_i-last_table_index)<=1 and last_table_columns is not None and last_table_columns==table_columns:
+                        if last_table_index is not None and abs(obj_i-last_table_index)<=2 and last_table_columns is not None and last_table_columns==table_columns:
                             if last_table is not None:
                                 trs = getTrs(_table)
                                 last_tbody = BeautifulSoup(last_table["text"],"lxml")
@@ -340,14 +395,9 @@ class ParseDocument():
                         else:
                             _find = False
                             for i in range(1,len(list_data)+1):
-                                _d = list_data[-i]
-                                if i==1 and _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
-                                    _data["parent_title"] = _d["parent_title"]
-                                    _d["title_next"] = _data
-                                    if _d["parent_title"] is not None:
-                                        _d["parent_title"]["child_title"].append(_data)
-                                    _find = True
+                                if _find:
                                     break
+                                _d = list_data[-i]
                                 if _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
                                     if _d["next_index"]==title_index and _d["title_next"] is None:
                                         _data["parent_title"] = _d["parent_title"]
@@ -356,6 +406,18 @@ class ParseDocument():
                                             _d["parent_title"]["child_title"].append(_data)
                                         _find = True
                                         break
+                            for i in range(1,len(list_data)+1):
+                                if _find:
+                                    break
+                                _d = list_data[-i]
+                                if i==1 and _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
+                                    _data["parent_title"] = _d["parent_title"]
+                                    _d["title_next"] = _data
+                                    if _d["parent_title"] is not None:
+                                        _d["parent_title"]["child_title"].append(_data)
+                                    _find = True
+                                    break
+
                             if not _find:
                                 if len(list_data)>0:
                                     for i in range(1,len(list_data)+1):
@@ -594,8 +656,9 @@ def extract_parameters_by_table(_product,_param_pattern,list_data,_data_i,list_r
                                 list_result.append(_cell[0])
 
 def extract_product_parameters(list_data,_product):
-    _param_pattern = "配置要求|技术要求|技术参数|具体参数|规格参数|参数要求|技术需求|配置清单|(质量|技术).{,10}要求|明细及参数|验收标准|^参数$"
+    _param_pattern = "产品配置|配置要求|技术要求|技术参数|参数指标|具体参数|规格参数|参数要求|技术需求|配置清单|(质量|技术).{,10}要求|明细及参数|验收标准|^参数$"
     list_result = []
+    _product = _product.strip()
     products = extract_products(list_data,_product)
 
     _product = get_correct_product(_product,products)
@@ -633,19 +696,34 @@ def extract_product_parameters(list_data,_product):
     #     print("result%d"%i,list_result[i])
     list_result.sort(key=lambda x:len(re.findall('[^.][0-9a-zA-Z]+[^.]',x)), reverse=True)
 
-    return list_result[0] if len(list_result)>0 else None
+    print("+++++++++++++++++++++")
+    for i in range(len(list_result)):
+        print("result%d"%i,list_result[i])
+    print("+++++++++++++++++++++")
+
+    for _result in list_result:
+        _check = True
+        for p in products:
+            if _result.find(p)>0 and not is_similar(_product,p,80):
+                _check = False
+        if len(_result)<10:
+            _check = False
+        if _check:
+            return _result
+
+    return None
 
 
 if __name__ == '__main__':
 
-    _html = open("download/7421e0c9d12dc6290ead4040df0e3cd0.html", "r", encoding="utf8").read()
+    _html = open("download/107015f8e994683fd88827ad209f0d13.html", "r", encoding="utf8").read()
 
-    pd = ParseDocument(_html)
+    pd = ParseDocument(_html,True)
 
     list_data = pd.tree
     pd.print_tree(list_data)
 
 
-    _text = extract_product_parameters(list_data,"4K高清摄像系统")
+    _text = extract_product_parameters(list_data,"CT")
     print("extract_text",_text)
 

+ 15 - 3
BaseDataMaintenance/maintenance/product/product_attachment.py

@@ -18,6 +18,7 @@ parameter_status_no_bidfile = -1
 parameter_status_to_process = 0
 parameter_status_process_succeed = 1
 parameter_status_process_failed = 2
+parameter_status_process_jump = 3
 
 class Product_Attachment_Processor():
 
@@ -98,7 +99,12 @@ class Product_Attachment_Processor():
                 objectPath = atta.getProperties().get(attachment_path)
                 _filetype = atta.getProperties().get(attachment_filetype)
                 if _filetype in ("doc","xls"):
-                    continue
+                    if len(list_filemd5)==1:
+                        dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_process_jump,True)
+                        dp.update_row(self.ots_client)
+                        return
+                    else:
+                        continue
                 localpath = "%s/%s.%s"%(self.download_path,_filemd5,_filetype)
                 localhtml = "%s/%s.%s"%(self.download_path,_filemd5,"html")
                 download_succeed = False
@@ -126,7 +132,13 @@ class Product_Attachment_Processor():
                                     f.write(_html)
                         if _success:
                             if len(_html)>5:
-                                list_data = ParseDocument(_html).tree
+                                list_data = ParseDocument(_html,True).tree
+                                list_text = []
+                                for _product in list_product:
+                                    _text = extract_product_parameters(list_data,_product)
+                                    if _text is not None:
+                                        list_text.append(_text)
+                                list_data = ParseDocument(_html,False).tree
                                 list_text = []
                                 for _product in list_product:
                                     _text = extract_product_parameters(list_data,_product)
@@ -192,7 +204,7 @@ def change_parameters_status():
     ],
                            must_not_queries=[
         TermQuery("parameter_status",parameter_status_to_process),
-        TermQuery("parameter_status",parameter_status_process_succeed)
+        # TermQuery("parameter_status",parameter_status_process_succeed)
     ])
     list_data = []
     rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",