|
@@ -40,6 +40,12 @@ file = os.path.dirname(__file__) + '/agency_set.pkl'
|
|
with open(file, 'rb') as f:
|
|
with open(file, 'rb') as f:
|
|
agency_set = pickle.load(f)
|
|
agency_set = pickle.load(f)
|
|
|
|
|
|
|
|
+def is_agency(entity_text):
|
|
|
|
+ if re.search('(招投?标|采购|代理|咨询|管理|物资|事务所?|顾问|监理|拍卖)[()\w]{,4}(有限)?(责任)?公司|(采购|招投?标|交易|代理|咨询)[()\w]{,4}(中心|服务所)|法院$',
|
|
|
|
+ entity_text) or entity_text in agency_set:
|
|
|
|
+ return True
|
|
|
|
+ return False
|
|
|
|
+
|
|
from threading import RLock
|
|
from threading import RLock
|
|
dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
"prem":{"predictor":None,"Lock":RLock()},
|
|
"prem":{"predictor":None,"Lock":RLock()},
|
|
@@ -794,13 +800,13 @@ class PREMPredict():
|
|
elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
|
|
elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
|
|
label = 5
|
|
label = 5
|
|
elif label == 0:
|
|
elif label == 0:
|
|
- if re.search('拟邀请$|受邀谈判方', front):
|
|
|
|
|
|
+ if re.search('拟邀请$|受邀谈判方|直购企业:$', front):
|
|
label = 2
|
|
label = 2
|
|
values[label] = 0.501
|
|
values[label] = 0.501
|
|
- elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and re.search('(招标|采购|咨询|代理|管理)\w*公司|(采购|交易)(中心|市场)', entity.entity_text):
|
|
|
|
|
|
+ elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and is_agency(entity.entity_text):
|
|
label = 1
|
|
label = 1
|
|
values[label] = 0.501
|
|
values[label] = 0.501
|
|
- elif re.search('采用$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
|
|
|
|
|
|
+ elif re.search('采用$|异议受理部门', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
|
|
label = 5
|
|
label = 5
|
|
elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
|
|
elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
|
|
label = 2
|
|
label = 2
|
|
@@ -824,14 +830,14 @@ class PREMPredict():
|
|
elif re.search('税费', front) and re.search('^承担', behind):
|
|
elif re.search('税费', front) and re.search('^承担', behind):
|
|
label = 5
|
|
label = 5
|
|
elif re.search('第一候补|第一后备|备选', front):
|
|
elif re.search('第一候补|第一后备|备选', front):
|
|
- label = 2
|
|
|
|
|
|
+ label = 3
|
|
values[label] = 0.6
|
|
values[label] = 0.6
|
|
elif re.search('放弃中标资格$|是否中标:否|^(中标|成交)(公示|公告)', behind):
|
|
elif re.search('放弃中标资格$|是否中标:否|^(中标|成交)(公示|公告)', behind):
|
|
values[2] = 0.5
|
|
values[2] = 0.5
|
|
label = 5
|
|
label = 5
|
|
- elif re.search('(承包权人|帐户名称):$', front):
|
|
|
|
|
|
+ elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$', front): # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位:
|
|
label = 5
|
|
label = 5
|
|
- elif re.search('合同供方:?$', front):
|
|
|
|
|
|
+ elif re.search('合同供方:?$|合同签约单位', front):
|
|
label = 0
|
|
label = 0
|
|
values[label] = 0.5
|
|
values[label] = 0.5
|
|
elif re.search('是否中标:是,供应商', front) and label == 5:
|
|
elif re.search('是否中标:是,供应商', front) and label == 5:
|
|
@@ -849,11 +855,13 @@ class PREMPredict():
|
|
values[label] = 0.501
|
|
values[label] = 0.501
|
|
elif re.search('^:受', behind): # 354009560 附件格式问题 ,中选中介服务机构通知书,编号:HZ2305120541,中汕项目管理有限公司:受惠东县人民政府大岭街道办事处委托
|
|
elif re.search('^:受', behind): # 354009560 附件格式问题 ,中选中介服务机构通知书,编号:HZ2305120541,中汕项目管理有限公司:受惠东县人民政府大岭街道办事处委托
|
|
label = 5
|
|
label = 5
|
|
- elif re.search('发布机构', front) and re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站|馆)$', entity.entity_text):
|
|
|
|
|
|
+ elif re.search('发布机构', front) and not is_agency(entity.entity_text):
|
|
label = 0
|
|
label = 0
|
|
values[label] = 0.501
|
|
values[label] = 0.501
|
|
elif re.search('开户银行:$', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
|
|
elif re.search('开户银行:$', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
|
|
label = 5
|
|
label = 5
|
|
|
|
+ elif re.search('委托$', front) and re.search('^(抽样|送检|看样)', behind):
|
|
|
|
+ label = 5
|
|
elif label in [3,4]:
|
|
elif label in [3,4]:
|
|
if re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
|
|
if re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
|
|
label = 2
|
|
label = 2
|
|
@@ -922,7 +930,7 @@ class PREMPredict():
|
|
label = 2
|
|
label = 2
|
|
elif label == 1: # 错误中标金额处理
|
|
elif label == 1: # 错误中标金额处理
|
|
if re.search('[::,。](总金额|总价|单价|合价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
|
|
if re.search('[::,。](总金额|总价|单价|合价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
|
|
- values[label] = 0.49
|
|
|
|
|
|
+ values[label] = 0.5
|
|
elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
|
|
elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
|
|
elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
|
|
@@ -1376,7 +1384,7 @@ class RoleRulePredictor():
|
|
"(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$)" # 解决表头识别不到加逗号情况,需前面为,。空
|
|
"(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$)" # 解决表头识别不到加逗号情况,需前面为,。空
|
|
self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
|
|
self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
|
|
"(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
|
|
"(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
|
|
- "|结果公示如下:摇出球号:\d+号,中介机构:$)" # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标
|
|
|
|
|
|
+ "|结果公示如下:摇出球号:\d+号,中介机构:$|直购企业:$)" # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标
|
|
|
|
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
|
|
"^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
|
|
"^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
|
|
@@ -1423,7 +1431,7 @@ class RoleRulePredictor():
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|采购成本价") # |建安费用 不作为招标金额
|
|
self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|采购成本价") # |建安费用 不作为招标金额
|
|
- self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):") # 单写 总价 不能作为中标金额,很多表格有单价、总价
|
|
|
|
|
|
+ self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):") # 单写 总价 不能作为中标金额,很多表格有单价、总价
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
|
|
self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
|
|
@@ -1451,8 +1459,9 @@ class RoleRulePredictor():
|
|
if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑
|
|
if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑
|
|
# print('p_entity_sentenceindex:', p_entity.sentence_index)
|
|
# print('p_entity_sentenceindex:', p_entity.sentence_index)
|
|
|
|
|
|
- if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', entity_text) \
|
|
|
|
- or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', entity_text) == None:
|
|
|
|
|
|
+ # if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', entity_text) \
|
|
|
|
+ # or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', entity_text) == None:
|
|
|
|
+ if is_agency(entity_text):
|
|
_role = 'tenderee'
|
|
_role = 'tenderee'
|
|
else:
|
|
else:
|
|
_role = "agency"
|
|
_role = "agency"
|
|
@@ -1586,7 +1595,8 @@ class RoleRulePredictor():
|
|
break
|
|
break
|
|
if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
|
|
if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
|
|
_name) >= 0:
|
|
_name) >= 0:
|
|
- if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
|
|
|
|
|
|
+ # if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
|
|
|
|
+ if is_agency(p_entity.entity_text): # 2024/3/29 统一方法判断是否为代理
|
|
find_flag = True
|
|
find_flag = True
|
|
_label = 1
|
|
_label = 1
|
|
p_entity.label = _label
|
|
p_entity.label = _label
|
|
@@ -1819,8 +1829,11 @@ class RoleRuleFinalAdd():
|
|
:param list_codenames:
|
|
:param list_codenames:
|
|
:return:
|
|
:return:
|
|
'''
|
|
'''
|
|
|
|
+
|
|
# text_end = list_articles[0].content.split('##attachment##')[0][-40:]
|
|
# text_end = list_articles[0].content.split('##attachment##')[0][-40:]
|
|
main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
|
|
main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
|
|
|
|
+ if len(list_sentences[0])>0 and list_sentences[0][-1].in_attachment:
|
|
|
|
+ main_sentences = list_sentences[0][-1:] + main_sentences[-2:]
|
|
if len(main_sentences)==0:
|
|
if len(main_sentences)==0:
|
|
return 0
|
|
return 0
|
|
# end_tokens = []
|
|
# end_tokens = []
|
|
@@ -1834,7 +1847,19 @@ class RoleRuleFinalAdd():
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
sear_ent = re.search('([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
sear_ent = re.search('([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
if sear_ent:
|
|
if sear_ent:
|
|
|
|
+ b, e = sear_ent.span()
|
|
|
|
+ if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):
|
|
|
|
+ sear_ent = None
|
|
break
|
|
break
|
|
|
|
+ if sear_ent == None:
|
|
|
|
+ text_end = list_articles[0].content[-100:]
|
|
|
|
+ sear_ent = re.search(
|
|
|
|
+ '([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?',
|
|
|
|
+ text_end)
|
|
|
|
+ if sear_ent:
|
|
|
|
+ b, e = sear_ent.span()
|
|
|
|
+ if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):
|
|
|
|
+ sear_ent = None
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
|
|
if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
|
|
@@ -1850,14 +1875,14 @@ class RoleRuleFinalAdd():
|
|
ents = []
|
|
ents = []
|
|
for ent in list_entitys[0]:
|
|
for ent in list_entitys[0]:
|
|
if ent.entity_type in ['org', 'company']:
|
|
if ent.entity_type in ['org', 'company']:
|
|
- if ent.label == 0 and ent.values[ent.label]>0.5:
|
|
|
|
|
|
+ if ent.label == 0 and ent.values[ent.label]>0.55:
|
|
if '公共资源交易中心' in ent.entity_text: # 公共资源交易中心不算招标或代理,只算平台
|
|
if '公共资源交易中心' in ent.entity_text: # 公共资源交易中心不算招标或代理,只算平台
|
|
# ent.label = 5
|
|
# ent.label = 5
|
|
ent.values[ent.label] = 0.6 if ent.values[ent.label]>0.6 else 0.5 # 改为降低概率,不改类别,防止 382573066 明显招标人表达不提取
|
|
ent.values[ent.label] = 0.6 if ent.values[ent.label]>0.6 else 0.5 # 改为降低概率,不改类别,防止 382573066 明显招标人表达不提取
|
|
continue
|
|
continue
|
|
tenderee_list.append(ent.entity_text)
|
|
tenderee_list.append(ent.entity_text)
|
|
tenderee_notfound = False
|
|
tenderee_notfound = False
|
|
- elif ent.label == 1:
|
|
|
|
|
|
+ elif ent.label == 1 and ent.values[ent.label]>0.55:
|
|
agency_list.append(ent.entity_text)
|
|
agency_list.append(ent.entity_text)
|
|
agency_notfound = False
|
|
agency_notfound = False
|
|
elif ent.label == 5:
|
|
elif ent.label == 5:
|
|
@@ -1869,33 +1894,24 @@ class RoleRuleFinalAdd():
|
|
ent_re = _sear_ent.group('entity')
|
|
ent_re = _sear_ent.group('entity')
|
|
ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
|
|
ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
|
|
|
|
|
|
- if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
|
|
|
|
- or re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) == None) \
|
|
|
|
- and ent_re not in agency_list and ent_re not in agency_set:
|
|
|
|
|
|
+ if tenderee_notfound or agency_notfound:
|
|
n = 0
|
|
n = 0
|
|
for i in range(len(ents) - 1, -1, -1):
|
|
for i in range(len(ents) - 1, -1, -1):
|
|
if not ents[i].in_attachment:
|
|
if not ents[i].in_attachment:
|
|
n += 1
|
|
n += 1
|
|
if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
|
|
if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
|
|
break
|
|
break
|
|
- if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and re.search('(大学|中学|小学|幼儿园|医院)$', ents[i].entity_text)) or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
|
|
- ents[i].label = 0
|
|
|
|
- ents[i].values[0] = 0.51 # 修改为比标题概率略高
|
|
|
|
- tenderee_notfound = False
|
|
|
|
- # log('正则最后补充实体: %s'%(ent_re))
|
|
|
|
- break
|
|
|
|
- elif agency_notfound == True and ent_re not in tenderee_list and (
|
|
|
|
- re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) or ent_re in agency_set):
|
|
|
|
- n = 0
|
|
|
|
- for i in range(len(ents) - 1, -1, -1):
|
|
|
|
- if not ents[i].in_attachment:
|
|
|
|
- n += 1
|
|
|
|
- if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
|
|
|
|
|
|
+ elif _sear_ent==sear_ent and ents[i].label != 5: # 后面有角色的实体的停止继续往前
|
|
break
|
|
break
|
|
- if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
|
|
- ents[i].label = 1
|
|
|
|
- ents[i].values[1] = 0.51 # 修改为比标题概率略高
|
|
|
|
- agency_notfound = False
|
|
|
|
|
|
+ if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and re.search('(大学|中学|小学|幼儿园|医院)$', ents[i].entity_text)) or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
|
|
+ if agency_notfound and is_agency(ents[i].entity_text) and ents[i].entity_text not in tenderee_list:
|
|
|
|
+ ents[i].label = 1
|
|
|
|
+ ents[i].values[1] = 0.51 # 修改为比标题概率略高
|
|
|
|
+ agency_notfound = False
|
|
|
|
+ elif tenderee_notfound and not is_agency(ents[i].entity_text) and ents[i].entity_text not in agency_list:
|
|
|
|
+ ents[i].label = 0
|
|
|
|
+ ents[i].values[0] = 0.51 # 修改为比标题概率略高
|
|
|
|
+ tenderee_notfound = False
|
|
# log('正则最后补充实体: %s'%(ent_re))
|
|
# log('正则最后补充实体: %s'%(ent_re))
|
|
break
|
|
break
|
|
if not tenderee_notfound:
|
|
if not tenderee_notfound:
|
|
@@ -2191,7 +2207,7 @@ class RoleGrade():
|
|
self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
|
|
self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
|
|
self.agency_left_9 = "(?P<agency_left_9>代理)"
|
|
self.agency_left_9 = "(?P<agency_left_9>代理)"
|
|
self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]:1|名次:1)"
|
|
self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]:1|名次:1)"
|
|
- self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方))"
|
|
|
|
|
|
+ self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方|最[终后]选[择取]))" # 229435497 最后选择西平,县中原彩印有限公司,作为此项目中标供应商,
|
|
self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
|
|
self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
|
|
self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
|
|
self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
|
|
self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
|
|
self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
|
|
@@ -2212,6 +2228,10 @@ class RoleGrade():
|
|
org_winner = []
|
|
org_winner = []
|
|
company_winner = []
|
|
company_winner = []
|
|
org_tenderee = []
|
|
org_tenderee = []
|
|
|
|
+ agency_l = []
|
|
|
|
+ agency_like_tenderee = [] # 类似招标人的代理人实体列表
|
|
|
|
+ low_prob_agency = []
|
|
|
|
+ low_prob_tenderee = []
|
|
for entity in list_entitys[0]:
|
|
for entity in list_entitys[0]:
|
|
if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> min_prob:
|
|
if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> min_prob:
|
|
text = sentences[entity.sentence_index].sentence_text
|
|
text = sentences[entity.sentence_index].sentence_text
|
|
@@ -2220,6 +2240,16 @@ class RoleGrade():
|
|
b = entity.wordOffset_begin
|
|
b = entity.wordOffset_begin
|
|
e = entity.wordOffset_end
|
|
e = entity.wordOffset_end
|
|
not_found = 1
|
|
not_found = 1
|
|
|
|
+ if re.search('(乙方:甲方:|甲方:乙方:)$', text[max(0, b-span):b]):
|
|
|
|
+ entity.label = 0 if entity.entity_type == 'org' else 2
|
|
|
|
+ entity.values[entity.label] = 0.55
|
|
|
|
+ continue
|
|
|
|
+ elif re.search('(采购|招标)人(?或(采购|招标)?代理机构)?:$', text[max(0, b-span):b]):
|
|
|
|
+ entity.label = 1 if is_agency(entity.entity_text) else 0
|
|
|
|
+ entity.values[entity.label] = 0.8
|
|
|
|
+ continue
|
|
|
|
+ elif re.search('(采购|招标|询比?价|遴选|寻源|比选)机构[是为:]+', text[max(0, b-span):b]) and not is_agency(entity.entity_text):
|
|
|
|
+ agency_like_tenderee.append(entity)
|
|
for pattern in self.pattern_list:
|
|
for pattern in self.pattern_list:
|
|
if 'left' in pattern:
|
|
if 'left' in pattern:
|
|
context = text[max(0, b-span):b]
|
|
context = text[max(0, b-span):b]
|
|
@@ -2262,10 +2292,31 @@ class RoleGrade():
|
|
company_winner.append(entity) # 保存中标人实体
|
|
company_winner.append(entity) # 保存中标人实体
|
|
if entity.label == 0 and entity.values[entity.label]> min_prob:
|
|
if entity.label == 0 and entity.values[entity.label]> min_prob:
|
|
org_tenderee.append(entity.entity_text) # 保存所有招标人名称
|
|
org_tenderee.append(entity.entity_text) # 保存所有招标人名称
|
|
- if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6: # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
|
|
|
|
- # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
|
|
|
|
|
|
+ elif entity.label == 1 and entity.values[entity.label]> min_prob:
|
|
|
|
+ agency_l.append(entity.entity_text)
|
|
|
|
+ # if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6: # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
|
|
|
|
+ # # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
|
|
|
|
+ # entity.label = 1
|
|
|
|
+ # entity.values[entity.label] = 0.5
|
|
|
|
+
|
|
|
|
+ elif entity.entity_type in ['org', 'company'] and entity.label in [1, 0] and 0.5<=entity.values[entity.label]<0.6:
|
|
|
|
+ if entity.label == 1:
|
|
|
|
+ low_prob_agency.append(entity)
|
|
|
|
+ else:
|
|
|
|
+ low_prob_tenderee.append(entity)
|
|
|
|
+
|
|
|
|
+ if org_tenderee == [] and agency_like_tenderee:
|
|
|
|
+ for entity in agency_like_tenderee:
|
|
|
|
+ entity.label = 0
|
|
|
|
+ entity.values[entity.label] = 0.6
|
|
|
|
+ for entity in low_prob_agency:
|
|
|
|
+ if entity.entity_text in org_tenderee:
|
|
|
|
+ entity.label = 0
|
|
|
|
+ entity.values[entity.label] = 0.6
|
|
|
|
+ for entity in low_prob_tenderee:
|
|
|
|
+ if entity.entity_text in agency_l:
|
|
entity.label = 1
|
|
entity.label = 1
|
|
- entity.values[entity.label] = 0.5
|
|
|
|
|
|
+ entity.values[entity.label] = 0.6
|
|
|
|
|
|
if org_winner != []:
|
|
if org_winner != []:
|
|
flag = 0
|
|
flag = 0
|
|
@@ -5874,8 +5925,10 @@ class TableTag2List():
|
|
if text_process != None:
|
|
if text_process != None:
|
|
# text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
|
|
# text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
|
|
# td_text = re.sub('\xa0', '', text_process(cell, final=False))
|
|
# td_text = re.sub('\xa0', '', text_process(cell, final=False))
|
|
- td_text = re.sub('\s|\xa0', '', str(cell.get_text())) # 修复 370835008 td 内公司被p标签拆分为两半情况
|
|
|
|
- if len(td_text)>30:
|
|
|
|
|
|
+ td_text = re.sub('\s|\xa0', '', str(cell.get_text())) # 修复 370835008 td 内公司被p标签拆分为两半情况
|
|
|
|
+ if 'title' in cell.attrs and cell.get_text().strip().endswith('...') and cell.get_text().strip()[:-3] in cell.attrs['title']:
|
|
|
|
+ td_text = cell.attrs['title'] # 修复 类似 215597851 省略号隐藏内容
|
|
|
|
+ elif len(td_text)>30:
|
|
td_text = re.sub('\xa0', '', text_process(cell, final=False))
|
|
td_text = re.sub('\xa0', '', text_process(cell, final=False))
|
|
if td_text == "":
|
|
if td_text == "":
|
|
td_text = ' '
|
|
td_text = ' '
|
|
@@ -5953,7 +6006,7 @@ class TablePremExtractor(object):
|
|
|
|
|
|
|
|
|
|
def find_header(self, td_list):
|
|
def find_header(self, td_list):
|
|
- fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
|
|
|
|
|
|
+ fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
|
|
header_dic = dict()
|
|
header_dic = dict()
|
|
flag = False
|
|
flag = False
|
|
contain_header = False
|
|
contain_header = False
|
|
@@ -6059,14 +6112,14 @@ class TablePremExtractor(object):
|
|
return {}
|
|
return {}
|
|
for i in df.index:
|
|
for i in df.index:
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
- project_code = df.loc[i, headers['project_code'][0]] if "project_code" in headers else ""
|
|
|
|
- package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
|
|
|
|
- project_name = df.loc[i, headers['project_name'][0]] if "project_name" in headers else ""
|
|
|
|
- tenderee = df.loc[i, headers['tenderee'][0]] if "tenderee" in headers else ""
|
|
|
|
- tenderer = df.loc[i, headers['tenderer'][0]] if "tenderer" in headers else ""
|
|
|
|
- budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
|
|
|
|
- bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
|
|
|
|
- win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
|
|
|
|
|
|
+ project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
|
|
|
|
+ package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
|
|
|
|
+ project_name = df.loc[i, headers['project_name'][0]].strip() if "project_name" in headers else ""
|
|
|
|
+ tenderee = df.loc[i, headers['tenderee'][0]].strip() if "tenderee" in headers else ""
|
|
|
|
+ tenderer = df.loc[i, headers['tenderer'][0]].strip() if "tenderer" in headers else ""
|
|
|
|
+ budget_ = df.loc[i, headers['budget'][0]].strip() if "budget" in headers else ""
|
|
|
|
+ bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
|
|
|
|
+ win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
|
|
|
|
|
|
if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
|
|
if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
|
|
# print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
|
|
# print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
|
|
@@ -6090,7 +6143,7 @@ class TablePremExtractor(object):
|
|
|
|
|
|
if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取 防止类似 328485591 作为多包
|
|
if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取 防止类似 328485591 作为多包
|
|
break
|
|
break
|
|
- if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
|
|
|
|
|
|
+ if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and (re.search('否|未(中标|成交|中选)', win_sort) or win_sort==''): # 2024/04/2 修复 252208201 为空的不中标
|
|
continue
|
|
continue
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
continue
|
|
continue
|
|
@@ -6303,7 +6356,7 @@ class CandidateExtractor(object):
|
|
self.headerset = pickle.load(f)
|
|
self.headerset = pickle.load(f)
|
|
|
|
|
|
def find_header(self, td_list):
|
|
def find_header(self, td_list):
|
|
- fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
|
|
|
|
|
|
+ fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
|
|
header_dic = dict()
|
|
header_dic = dict()
|
|
flag = False
|
|
flag = False
|
|
contain_header = False
|
|
contain_header = False
|
|
@@ -6384,23 +6437,28 @@ class CandidateExtractor(object):
|
|
findtop3 = False
|
|
findtop3 = False
|
|
findmoney = False
|
|
findmoney = False
|
|
line_num = 0
|
|
line_num = 0
|
|
|
|
+ line_package = None
|
|
for i in df.index:
|
|
for i in df.index:
|
|
- package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
|
|
|
|
- candidate_ = df.loc[i, headers['candidate'][0]] if "candidate" in headers else ""
|
|
|
|
- win_or_not = df.loc[i, headers['win_or_not'][0]] if "win_or_not" in headers else ""
|
|
|
|
|
|
+ package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
|
|
|
|
+ candidate_ = df.loc[i, headers['candidate'][0]].strip() if "candidate" in headers else ""
|
|
|
|
+ win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
|
|
# budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
|
|
# budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
|
|
- bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
|
|
|
|
- win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
|
|
|
|
- win_tenderer = df.loc[i, headers['win_tenderer'][0]] if "win_tenderer" in headers else ""
|
|
|
|
- second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
|
|
|
|
- third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
|
|
|
|
|
|
+ bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
|
|
|
|
+ win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
|
|
|
|
+ win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
|
|
|
|
+ second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
|
|
|
|
+ third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
|
|
|
|
|
|
if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配 # 排除 ,win_sort 避免367940050漏提取
|
|
if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配 # 排除 ,win_sort 避免367940050漏提取
|
|
# print('包含表头, 停止匹配')
|
|
# print('包含表头, 停止匹配')
|
|
break
|
|
break
|
|
if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2: # 全部为空或内容一样 停止匹配
|
|
if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2: # 全部为空或内容一样 停止匹配
|
|
# print('全部为空或内容一样 停止匹配')
|
|
# print('全部为空或内容一样 停止匹配')
|
|
- break
|
|
|
|
|
|
+ if len(set(df.loc[i,:]))==1 and re.search('^第?([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段])([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?$', win_sort):
|
|
|
|
+ line_package = win_sort
|
|
|
|
+ continue
|
|
|
|
+ else:
|
|
|
|
+ break
|
|
|
|
|
|
if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
|
|
if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
|
|
col_indx = headers['candidate'][0] -1
|
|
col_indx = headers['candidate'][0] -1
|
|
@@ -6411,6 +6469,8 @@ class CandidateExtractor(object):
|
|
win_sort = pre_col
|
|
win_sort = pre_col
|
|
|
|
|
|
package_code = package_code_raw
|
|
package_code = package_code_raw
|
|
|
|
+ if package_code == '' and line_package:
|
|
|
|
+ package_code = line_package
|
|
|
|
|
|
# candidate = candidate_ if self.is_role(candidate_) else ""
|
|
# candidate = candidate_ if self.is_role(candidate_) else ""
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
@@ -6637,6 +6697,10 @@ class WebsourceTenderee():
|
|
'''
|
|
'''
|
|
p = '(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校|银行|研究院|血站|红十字会|防治院|研究所)'
|
|
p = '(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校|银行|研究院|血站|红十字会|防治院|研究所)'
|
|
web_ree = self.webno2ree.get(web_source_no, '')
|
|
web_ree = self.webno2ree.get(web_source_no, '')
|
|
|
|
+ if web_source_no.startswith('18591-') and web_ree == "":
|
|
|
|
+ web_ree = '中国人民解放军总医院'
|
|
|
|
+ elif web_source_no.startswith('Y00484-') and web_ree == "":
|
|
|
|
+ web_ree = '航空总医院'
|
|
if web_ree != '':
|
|
if web_ree != '':
|
|
if 'Project' in prem[0]['prem']:
|
|
if 'Project' in prem[0]['prem']:
|
|
find_tenderee = False
|
|
find_tenderee = False
|
|
@@ -7035,18 +7099,18 @@ if __name__=="__main__":
|
|
# rs = product_attr.predict(docid='', html=html, page_time="")
|
|
# rs = product_attr.predict(docid='', html=html, page_time="")
|
|
# print(rs)
|
|
# print(rs)
|
|
|
|
|
|
- docid = ""
|
|
|
|
- title = ''
|
|
|
|
- with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
|
- html = f.read()
|
|
|
|
- tb_extract = TablePremExtractor()
|
|
|
|
- rs = tb_extract.predict(html, [
|
|
|
|
- "广东省广裕集团嘉顺实业有限责任公司",
|
|
|
|
- "广州顺为招标采购有限公司",
|
|
|
|
- "中华人民共和国"
|
|
|
|
- ], web_source_name = '河钢供应链管理平台')
|
|
|
|
- print('标段数:',len(rs))
|
|
|
|
- print(rs)
|
|
|
|
|
|
+ # docid = ""
|
|
|
|
+ # title = ''
|
|
|
|
+ # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
|
+ # html = f.read()
|
|
|
|
+ # tb_extract = TablePremExtractor()
|
|
|
|
+ # rs = tb_extract.predict(html, [
|
|
|
|
+ # "广东省广裕集团嘉顺实业有限责任公司",
|
|
|
|
+ # "广州顺为招标采购有限公司",
|
|
|
|
+ # "中华人民共和国"
|
|
|
|
+ # ], web_source_name = '河钢供应链管理平台')
|
|
|
|
+ # print('标段数:',len(rs))
|
|
|
|
+ # print(rs)
|
|
|
|
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
# # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|
|
# # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|