Browse Source

修复某些公告产品属性提取

lsm 1 year ago
parent
commit
b44b448b53
2 changed files with 7 additions and 7 deletions
  1. BIN
      BiddingKG/dl/interface/header_set.pkl
  2. +7 -7
      BiddingKG/dl/interface/predictor.py

BIN
BiddingKG/dl/interface/header_set.pkl


+7 -7
BiddingKG/dl/interface/predictor.py

@@ -3072,11 +3072,11 @@ class ProductAttributesPredictor():
                     if product != "":
                         if id2 != "":
                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
-                                if re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|万?元', tds[id2]):
+                                if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7},?)$)|万?元', tds[id2]):
                                     i += 1
                                     continue
                                 quantity = tds[id2]
-                            elif re.search('\w', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
+                            elif re.search('\w{5,}', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
                                 i += 1
                                 continue
                         if id2_2 != "":
@@ -3085,9 +3085,9 @@ class ProductAttributesPredictor():
                         if id3 != "":
                             if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                                 unitPrice = tds[id3]
-                            elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id3].strip()):
+                            elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id3].strip()):
                                 unitPrice = tds[id3]
-                            elif re.search('\w', tds[id3]) and re.search('^详见|^略', tds[id3])==None:
+                            elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id3])) > 5:
                                 i += 1
                                 continue
                         if id4 != "":
@@ -3119,9 +3119,9 @@ class ProductAttributesPredictor():
                         if id9 != "":
                             if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id9]):
                                 total_price = tds[id9]
-                            elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id9].strip()):
+                            elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id9].strip()):
                                 total_price = tds[id9]
-                            elif re.search('\w', tds[id9]) and re.search('^详见|^略', tds[id9])==None:
+                            elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id9])) > 5:
                                 i += 1
                                 continue
                         if id10 != "":
@@ -3129,7 +3129,7 @@ class ProductAttributesPredictor():
                             if re.match('^详见|^略$', parameter.strip()):
                                 parameter = ""
                         if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
-                            if id2 != "" and id3 != "" and len(re.split('[;;、,\n]', tds[id2])) > 1 and len(re.split('[;;、,\n]', tds[id1])) == len(re.split('[;;、,\n]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
+                            if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]', tds[id2])) > 1 and len(re.split('[;;、,\n]', tds[id1])) == len(re.split('[;;、,\n]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
                                 products = re.split('[;;、,\n]', tds[id1])
                                 quantitys = re.split('[;;、,\n]', tds[id2])
                                 unitPrices = re.split('[;;、,\n]', tds[id3])