|
@@ -53,7 +53,8 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
|
"industry": {"predictor": None, "Lock": RLock()},
|
|
|
"rolegrade": {"predictor": None, "Lock": RLock()},
|
|
|
"moneygrade": {"predictor": None, "Lock": RLock()},
|
|
|
- "district": {"predictor": None, "Lock": RLock()}
|
|
|
+ "district": {"predictor": None, "Lock": RLock()},
|
|
|
+ 'tableprem': {"predictor": None, "Lock": RLock()},
|
|
|
}
|
|
|
|
|
|
|
|
@@ -97,6 +98,8 @@ def getPredictor(_type):
|
|
|
dict_predictor[_type]["predictor"] = MoneyGrade()
|
|
|
if _type == 'district':
|
|
|
dict_predictor[_type]["predictor"] = DistrictPredictor()
|
|
|
+ if _type == 'tableprem':
|
|
|
+ dict_predictor[_type]["predictor"] = TablePremExtractor()
|
|
|
return dict_predictor[_type]["predictor"]
|
|
|
raise NameError("no this type of predictor")
|
|
|
|
|
@@ -4317,6 +4320,303 @@ class DistrictPredictor():
|
|
|
rs = rs2
|
|
|
return rs
|
|
|
|
|
|
class TablePremExtractor(object):
    """Extract package / role / money information from HTML tables.

    The extractor flattens each ``<table>`` into a rectangular list of cell
    texts, looks for a row that matches the header rules below, and turns the
    rows following that header into a ``{package: {...}}`` dictionary.
    """

    # Pattern that pulls the first money-looking token (Chinese capital
    # numerals or Arabic digits with an optional trailing "万") out of a cell.
    _PRICE_PATTERN = r"[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?"
    # Amounts above this value are discarded (reset to 0).
    _MAX_MONEY = 10000000000000

    def __init__(self):
        """Set up the header rules and load the known-header vocabulary.

        Raises:
            OSError: if ``header_set.pkl`` cannot be opened next to this module.
        """
        # Header-matching regex for every element that can be extracted.
        self.head_rule_dic = {
            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
            "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程)(名称?|内容)",
            "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
            "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
            "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
            "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
        }

        # os.path.join keeps the path relative when dirname(__file__) is empty
        # instead of silently pointing at the filesystem root.
        header_path = os.path.join(os.path.dirname(__file__), 'header_set.pkl')
        # NOTE(review): pickle.load executes arbitrary code from the file; this
        # is only safe as long as header_set.pkl is a trusted, packaged asset.
        with open(header_path, 'rb') as f:
            self.headerset = pickle.load(f)

    @staticmethod
    def _parse_span(value):
        """Return the integer rowspan/colspan encoded in *value*, at least 1.

        Non-digit characters are stripped; a missing, digit-free or zero
        value counts as a span of 1 so the cell is never dropped.
        """
        if value and re.search('[0-9]', value):
            return max(1, int(re.sub('[^0-9]', '', value)))
        return 1

    def table2list(self, table):
        """Flatten an HTML table into a list of rows of cell texts.

        Cells with rowspan/colspan are duplicated into every grid slot they
        cover. Cell text is stripped of whitespace and truncated to 200
        characters.

        Args:
            table: a parsed table element exposing ``find_all``; its cells
                must expose ``name``, ``get`` and ``get_text``.

        Returns:
            A list of rows (lists of strings). Slots never covered by a cell
            are ``""``. An empty list is returned when a row grows past 50
            columns.

        Raises:
            Exception: if the cell text triggers a ``UnicodeEncodeError``.
        """
        self._output = []
        row_ind = 0
        for row in table.find_all('tr'):
            col_ind = 0
            for cell in row.children:
                if cell.name not in ('td', 'th'):
                    continue
                row_span = self._parse_span(cell.get('rowspan'))
                col_span = self._parse_span(cell.get('colspan'))

                # Skip slots already claimed by a rowspan from an earlier row.
                while not self._check_cell_validity(row_ind, col_ind):
                    col_ind += 1

                try:
                    # NOTE(review): as written here the two parenthesis
                    # replacements map a character to itself; presumably the
                    # originals were full-width parentheses - confirm against
                    # the repository before relying on it.
                    text = str(cell.get_text()).replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
                    text = re.sub(r'\s', '', text)[:200]  # the first 200 characters are enough
                    self._insert(row_ind, col_ind, row_span, col_span, text)
                except UnicodeEncodeError as err:
                    raise Exception( 'Failed to decode text; you might want to specify kwargs transformer=unicode' ) from err

                col_ind += col_span
                if col_ind > 50:  # tables wider than 50 columns are discarded
                    return []

            # Spans are clamped to >= 1, so every <tr> advances exactly one row.
            row_ind += 1

        # Unfilled padding slots are tracked as None internally; callers
        # expect plain strings.
        self._output = [["" if value is None else value for value in cells]
                        for cells in self._output]
        return self._output

    def _check_validity(self, i, j, height, width):
        """Return True if the rectangle (i, j, height, width) is entirely free."""
        return all(self._check_cell_validity(ii, jj)
                   for ii in range(i, i + height)
                   for jj in range(j, j + width))

    def _check_cell_validity(self, i, j):
        """Return True if grid slot (i, j) is outside the grid or still unfilled."""
        if i >= len(self._output):
            return True
        if j >= len(self._output[i]):
            return True
        return self._output[i][j] is None

    def _insert(self, i, j, height, width, val):
        """Write *val* into every free slot of the rectangle (i, j, height, width)."""
        for ii in range(i, i + height):
            for jj in range(j, j + width):
                self._insert_cell(ii, jj, val)

    def _insert_cell(self, i, j, val):
        """Write *val* at (i, j), growing the grid as needed.

        Padding slots are filled with None (not ""), so that a slot that was
        merely padded can be told apart from a cell whose text is empty.
        An already-filled slot is left untouched.
        """
        while i >= len(self._output):
            self._output.append([])
        cells = self._output[i]
        while j >= len(cells):
            cells.append(None)
        if cells[j] is None:
            cells[j] = val

    def find_header(self, td_list):
        """Decide whether a row is a header row and map its columns.

        Args:
            td_list: list of cell texts of one table row.

        Returns:
            ``(flag, header_dic)``. *flag* is True when the row has more than
            two distinct cells and over 60% of them are in ``self.headerset``.
            *header_dic* maps element name to ``(column index, header text)``;
            it is empty unless the row has at least one of project code /
            package code / project name AND at least one of budget / tenderer.
        """
        header_dic = dict()
        distinct = set(td_list)
        flag = len(distinct) > 2 and len(distinct & self.headerset) / len(distinct) > 0.6
        for i, text in enumerate(td_list):
            if len(text) > 15:  # cells longer than 15 characters are not matched as headers
                continue
            if re.search('未(中标|成交)原因', text):  # tables listing failure reasons are not extracted
                return flag, dict()
            num = 0
            for k, v in self.head_rule_dic.items():
                if re.search(v, text):
                    header_dic[k] = (i, text)
                    num += 1
            if num > 1:
                # One cell matching two different elements is ambiguous.
                print('表头错误,一个td匹配到两个表头:', header_dic)
                return flag, dict()
        has_identity = ('project_code' in header_dic or 'package_code' in header_dic
                        or 'project_name' in header_dic)
        has_payload = 'budget' in header_dic or 'tenderer' in header_dic
        if has_identity and has_payload:
            return flag, header_dic
        return flag, dict()

    def is_role(self, text):
        """Return True if *text* looks like a single organisation name.

        Texts shorter than 5 or longer than 25 characters, or containing more
        than one "有限公司"-style suffix, are rejected. Otherwise a suffix regex
        is tried first and the ``selffool`` NER is used as a fallback.
        """
        if len(text) > 25 or len(text) < 5:
            return False
        if len(re.findall('有限责?任?公司', text)) > 1:
            return False
        if re.search(r'[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
            return True
        ners = selffool.ner(text)
        if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
            return True
        return False

    def _parse_money(self, raw, header_text):
        """Parse a money cell; return ``(amount, unit)`` or None if no amount is found.

        "万元" is appended when the column header declares that unit and the
        cell itself does not mention "万". Amounts above ``_MAX_MONEY`` become 0.
        """
        found = re.findall(self._PRICE_PATTERN, raw)
        if not found:
            return None
        money_text = found[0]
        if '万元' in header_text and '万' not in money_text:
            money_text += '万元'
        amount = float(str(getUnifyMoney(money_text)))
        if amount > self._MAX_MONEY:
            amount = 0
        unit = '万元' if '万' in money_text else '元'
        return amount, unit

    @staticmethod
    def _role_entry(role_name, role_text, money=0, money_unit=""):
        """Build one ``roleList`` entry with the fixed output schema."""
        return {
            "address": "",
            "linklist": [],
            "role_money": {
                "discount_ratio": "",
                "downward_floating_ratio": "",
                "floating_ratio": "",
                "money": money,
                "money_unit": money_unit
            },
            "role_name": role_name,
            "role_text": role_text,
            "serviceTime": ""
        }

    def extract_from_df(self, df, headers):
        """Turn the data rows below a header into a package dictionary.

        Args:
            df: DataFrame of cell texts, one row per table row, columns
                addressed by the integer positions stored in *headers*.
            headers: mapping produced by ``find_header``.

        Returns:
            ``{package: {'code', 'name', 'roleList', 'tendereeMoney',
            'tendereeMoneyUnit'}}``. Packages that end up with neither a role
            nor a budget are dropped.
        """
        prem_dic = {}
        previous_package = ""  # package code of the previous row
        multi_same_package = False  # a package code repeated on non-adjacent rows
        package_fix2raw = dict()  # normalised package name -> raw package text
        link_set = set()
        budget_header = headers['budget'][1] if 'budget' in headers else ""
        bid_header = headers['bid_amount'][1] if 'bid_amount' in headers else ""

        for i in df.index:
            def field(key):
                return df.loc[i, headers[key][0]] if key in headers else ""

            # Adjacent repeated package code, usually produced by a rowspan:
            # one package purchasing several items.
            same_package = False
            project_code = field('project_code')
            package_code_raw = field('package_code')
            project_name = field('project_name')
            tenderee = field('tenderee')
            tenderer = field('tenderer')
            budget_ = field('budget')
            bid_amount_ = field('bid_amount')
            win_sort = field('win_sort')

            package_code = package_code_raw
            if re.search('合计|总计', package_code + project_code):
                continue  # summary rows
            if package_code != '' and package_code == previous_package:
                same_package = True
                project_name = ''
            previous_package = package_code

            if win_sort != "":
                win_header = headers['win_sort'][1]
                # Ranking column: keep only rows ranked first.
                if re.search('排名|排序|名次', win_header) and re.search('[一1]', win_sort) is None:
                    continue
                # "Won the bid?" column: rows answering "否" (no) are losers
                # and must be skipped. The previous test was inverted and
                # skipped every row that did NOT contain "否".
                if re.search('是否中标', win_header) and re.search('否', win_sort):
                    continue
            elif ("tenderer" in headers and re.search('候选|入围', headers['tenderer'][1])
                    and 'bid_amount' in headers
                    and re.search('(中标|成交)价', bid_header) is None):
                # Candidate list without a winning price column: the listed
                # companies are not treated as winners.
                tenderer = ""

            tenderee = tenderee if self.is_role(tenderee) else ""
            tenderer = tenderer if self.is_role(tenderer) else ""

            row_key = (project_code, package_code, project_name, tenderee,
                       tenderer, budget_, bid_amount_)
            if len(set(row_key)) < 2:
                break  # every field identical: stop reading this table
            if row_key in link_set:
                continue  # exact duplicate of an earlier row
            link_set.add(row_key)

            package = package_code if package_code else str(i + 1)
            package = uniform_package_name(package)

            # If the normalised package name was already seen on a
            # non-adjacent row, fall back to the raw package text as key.
            if not multi_same_package and package not in package_fix2raw:
                package_fix2raw[package] = package_code_raw
            elif not same_package:
                multi_same_package = True
            if multi_same_package:
                package = package_code_raw
            if package not in prem_dic or not same_package:
                prem_dic[package] = {
                    'code': '',
                    'name': '',
                    'roleList': [],
                    'tendereeMoney': 0,
                    'tendereeMoneyUnit': ""
                }

            entry = prem_dic[package]
            entry['code'] = project_code
            entry['name'] = project_name

            parsed_budget = self._parse_money(budget_, budget_header)
            if parsed_budget is not None:
                budget, budget_unit = parsed_budget
                # Several items inside one package: budgets are summed unless
                # the value is simply repeated.
                if same_package and entry['tendereeMoney'] != budget:
                    entry['tendereeMoney'] += budget
                else:
                    entry['tendereeMoney'] = budget
                entry['tendereeMoneyUnit'] = budget_unit

            if tenderee and not same_package:
                entry['roleList'].append(self._role_entry("tenderee", tenderee))
            if tenderer and not same_package:
                bid_amount = 0
                money_unit = ""
                parsed_bid = self._parse_money(bid_amount_, bid_header)
                if parsed_bid is not None:
                    bid_amount, money_unit = parsed_bid
                entry['roleList'].append(
                    self._role_entry("win_tenderer", tenderer, bid_amount, money_unit))

            # Rows carrying only a project code / name are discarded.
            if len(entry['roleList']) == 0 and entry['tendereeMoney'] == 0:
                prem_dic.pop(package)

        if multi_same_package:
            # Re-key the entries stored before the clash was detected so that
            # all keys use the raw package text.
            for k, v in package_fix2raw.items():
                if k in prem_dic:
                    prem_dic[v] = prem_dic.pop(k)
        return prem_dic

    def get_prem(self, soup):
        """Extract package information from every table inside *soup*.

        Tables are visited in reverse document order and detached from the
        tree after being flattened, so *soup* is modified in place.

        Returns:
            The merged package dictionary; results of tables processed later
            overwrite earlier ones that share a package key.
        """
        tables = soup.find_all('table')
        tables.reverse()
        rs_dic = {}
        for table in tables:
            trs = self.table2list(table)
            table.extract()
            i = 0
            while i < len(trs) - 1:
                flag_, headers_ = self.find_header(trs[i])
                if flag_ and headers_:
                    table_items = []
                    headers = headers_
                    for j in range(i + 1, len(trs)):
                        if len(trs[j]) != len(trs[i]):
                            print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                            break
                        next_flag, _ = self.find_header(trs[j])
                        if next_flag:
                            break  # a new header row starts another section
                        table_items.append(trs[j])
                    if len(table_items) > 1:
                        df = pd.DataFrame(table_items)
                        rs_dic.update(self.extract_from_df(df, headers))
                    # Resume scanning at the row that ended this section.
                    i = j - 1
                i += 1
        return rs_dic

    def predict(self, html):
        """Extract package information from an HTML document.

        The ``div.richTextFetch`` element is detached first and only
        consulted when the remaining document yields nothing.
        """
        soup = BeautifulSoup(html, 'lxml')
        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
        if richText:
            richText = richText.extract()  # keep attachments out of the main pass
        prem = self.get_prem(soup)
        if prem == {} and richText:
            prem = self.get_prem(richText)
        return prem
|
|
|
+
|
|
|
|
|
|
def getSavedModel():
|
|
|
#predictor = FormPredictor()
|