|
- '''
- Created on 2018年12月29日
- @author: User
- '''
- from BiddingKG.dl.common.Utils import *
- import json
- from bs4 import BeautifulSoup
- import re
class RelationsTree():
    '''
    @summary: attribute tree stored as nested dicts.

    ``tree`` is the root dict. ``nodes`` maps every registered node name to
    the dict that represents that node inside ``tree``, so children can be
    attached to any already-registered parent by name.
    '''

    def __init__(self):
        root = {}
        self.tree = root
        # "ROOT" is the only node registered up front; it aliases the tree itself.
        self.nodes = {"ROOT": root}

    def add_relation(self, relation, parent, child):
        '''
        @summary: attach ``child`` below ``parent`` under a key derived from
                  the relation name and both node names.
        @param:
            relation: relation name (string), used as the key prefix
            parent: name of an already-registered node
            child: name under which the new sub-dict is registered

        The call is a no-op when ``parent`` has not been registered.
        '''
        if parent not in self.nodes:
            return
        edge_key = relation + "_" + str(parent) + str(child)
        subtree = {}
        # The same dict object is stored in the tree and indexed under the
        # child's name, so later additions to the child show up in the tree.
        self.nodes[parent][edge_key] = subtree
        self.nodes[child] = subtree
# Matches text that ends in one of the known attachment extensions; the
# extension (without the dot) is captured in the named group "attachment".
# Written as a raw string because "\." in a plain literal is an invalid
# escape sequence.
pattern_attachment = re.compile(r"\.(?P<attachment>jpg|jpeg|png|swf|tif|pdf|doc|docx|xls|xlsx|zip|rar|tar|7z|wim)$")
class Article():
    '''
    @summary: an article (announcement) with its metadata.
    '''

    def __init__(self,id,content,sourceContent,doc_id,title,code="",name="",bidway=""):
        '''
        @param:
            id: uuid of the article
            content: article text after preprocessing
            sourceContent: original HTML, scanned for attachment links
            doc_id: document id
            title: article title
            code: stored as-is on ``self.code``
            name: stored as-is on ``self.name``
            bidway: stored as-is on ``self.bidway``
        '''
        self.id = id
        self.content = content
        self.code = code
        self.name = name
        self.sourceContent = sourceContent
        self.doc_id = doc_id
        self.title = title
        self.fingerprint = ""
        self.match_enterprise = []
        self.match_enterprise_type = 0
        # Comma-separated attachment extensions found in the source HTML.
        self.attachmentTypes = self.getAttachmentTypes(sourceContent)
        self.bidway = bidway

    def toJson(self):
        '''
        @summary: serialise the constructor fields to a JSON string.

        ``sourceContent`` carries the real attribute value (it used to be the
        literal text "self.sourceContent") and ``bidway`` is included so that
        ``fromJson`` can restore it.
        '''
        _dict = {"id":self.id,"content":self.content,"code":self.code,
                 "name":self.name,"sourceContent":self.sourceContent,"doc_id":self.doc_id,
                 "title":self.title,"bidway":self.bidway}
        return json.dumps(_dict)

    @staticmethod
    def fromJson(_json):
        '''
        @summary: rebuild an Article from a string produced by ``toJson``.

        JSON written before ``bidway`` was serialised still loads; ``bidway``
        then falls back to the constructor default "".
        '''
        _dict = json.loads(_json)
        return Article(_dict.get("id"),_dict.get("content"),
                       _dict.get("sourceContent"),_dict.get("doc_id"),_dict.get("title"),
                       _dict.get("code"),_dict.get("name"),_dict.get("bidway",""))

    @staticmethod
    def getAttachmentTypeFromUrl(url):
        '''
        @summary: return the attachment extension at the end of ``url`` as
                  matched by ``pattern_attachment``, or None when there is
                  no match.
        '''
        _match = re.search(pattern_attachment,url)
        if _match is None:
            return None
        return _match.groupdict().get("attachment")

    @staticmethod
    def getAttachmentTypes(sourceHtml):
        '''
        @summary: collect the attachment extensions referenced by ``<a href>``
                  and ``<img src>`` in the HTML.
        @return: the distinct extensions joined by "," in sorted order, or ""
                 when there is no HTML or no attachment link.
        '''
        # Nothing to parse (e.g. "sourceContent" missing from loaded JSON):
        # do not hand None to the HTML parser.
        if not sourceHtml:
            return ""
        _soup = BeautifulSoup(sourceHtml,"lxml")
        set_types = set()
        for _tag_name,_url_attr in (("a","href"),("img","src")):
            for _tag in _soup.find_all(_tag_name):
                _type = Article.getAttachmentTypeFromUrl(_tag.attrs.get(_url_attr,""))
                if _type is not None:
                    set_types.add(_type)
        # Sorted so the result does not depend on set iteration order.
        return ",".join(sorted(set_types))
-
-
class Sentences():
    '''
    @summary: one sentence of an article with its token-level annotations.
    '''

    def __init__(self,doc_id,sentence_index,sentence_text,tokens,pos_tags,ner_tags,in_attachment=False):
        '''
        @param:
            doc_id: uuid of the article
            sentence_index: index of the sentence within the article
            sentence_text: sentence content
            tokens: tokenised sentence
            pos_tags: part-of-speech tags (not used by the algorithm at the moment, currently empty)
            ner_tags: named-entity tags
            in_attachment: whether the sentence comes from an attachment (added 2022/02/10)
        '''
        self.doc_id = doc_id
        self.sentence_index = sentence_index
        self.sentence_text = sentence_text
        self.tokens = tokens
        self.pos_tags = pos_tags
        self.ner_tags = ner_tags
        self.in_attachment = in_attachment

    def toJson(self):
        '''
        @summary: serialise every constructor field, including
                  ``in_attachment``, to a JSON string.
        '''
        _dict = {"doc_id":self.doc_id,"sentence_index":self.sentence_index,"sentence_text":self.sentence_text,
                 "tokens":self.tokens,"pos_tags":self.pos_tags,"ner_tags":self.ner_tags,
                 "in_attachment":self.in_attachment}
        return json.dumps(_dict)

    @staticmethod
    def fromJson(_json):
        '''
        @summary: rebuild a Sentences object from a string produced by ``toJson``.

        JSON written without "in_attachment" still loads; the flag then
        defaults to False.
        '''
        _dict = json.loads(_json)
        return Sentences(_dict.get("doc_id"),_dict.get("sentence_index"),_dict.get("sentence_text"),_dict.get("tokens"),
                         _dict.get("pos_tags"),_dict.get("ner_tags"),_dict.get("in_attachment",False))
class Outline():
    '''
    @summary: one outline section of an announcement (the outline is split by regex).
    '''

    def __init__(self, doc_id,outline_index,outline_text, sentence_begin_index, sentence_end_index,wordOffset_begin,wordOffset_end):
        '''
        @param:
            doc_id: uuid of the article
            outline_index: stored as-is on ``self.outline_index``
            outline_text: full text of the outline section
            sentence_begin_index: index of the first sentence
            sentence_end_index: index of the last sentence
            wordOffset_begin: start character offset within the first sentence
            wordOffset_end: end character offset within the last sentence

        ``outline_summary`` (summary of the section) is not a parameter; it
        starts as an empty string.
        '''
        # Identity and text of the section.
        self.doc_id = doc_id
        self.outline_index = outline_index
        self.outline_text = outline_text
        # Span of the section: sentence range, then character offsets.
        self.sentence_begin_index, self.sentence_end_index = sentence_begin_index, sentence_end_index
        self.wordOffset_begin, self.wordOffset_end = wordOffset_begin, wordOffset_end
        self.outline_summary = ""
class Entity():
    '''
    @summary: an extracted entity and the attributes linked to it.
    '''

    def __init__(self,doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,wordOffset_begin=None,wordOffset_end=None,label=None,values=None,person_phone=None,in_attachment=False, prob=0):
        '''
        @param:
            doc_id: uuid of the article
            entity_id: unique among entities of the same type; because several
                       extraction algorithms are used, entities of different
                       types may share an entity_id
            entity_text: entity content
            entity_type: entity type
            sentence_index: index of the sentence
            begin_index: start position of the entity in its sentence
            end_index: end position of the entity in its sentence
            wordOffset_begin: stored as-is on ``self.wordOffset_begin``
            wordOffset_end: stored as-is on ``self.wordOffset_end``
            label: class the entity belongs to
            values: probability of each class
            person_phone: accepted for signature compatibility but ignored;
                          ``self.person_phone`` always starts as an empty list
            in_attachment: whether the entity is in an attachment (added 2022/02/10)
            prob: probability of the entity (added 2022/06/20)
        '''
        self.doc_id = doc_id
        self.entity_id = entity_id
        self.entity_text = entity_text
        self.entity_type = entity_type
        self.sentence_index = sentence_index
        self.begin_index = begin_index
        self.end_index = end_index
        self.wordOffset_begin = wordOffset_begin
        self.wordOffset_end = wordOffset_end
        self.label = label
        self.values = values
        self.handlabel = True
        self.packageName = "Project"
        self.packageCode = ""
        self.roleName = ""
        self.linked_entitys = []
        self.pointer_pack = None
        self.pointer_money = None
        self.pointer_person = []
        self.pointer_address = None
        self.pointer_tendereeMoney = None
        # The constructor argument is deliberately not used here.
        self.person_phone = []
        self.pointer_email = None
        self.is_tail = False
        self.notes = ''  # added 2021/7/20: remarks such as upper/lower-case amount, unit
        self.money_unit = ''  # added 2021/8/17: unit of the amount (yuan, 10k yuan, 100M yuan)
        self.if_dict_match = 0  # added 2021/12/21: whether the company entity was found via dictionary
        self.is_total_money = 0  # added 2021/12/29: whether the amount is a total price
        self.is_unit_money = 0  # added 2021/12/29: whether the amount is a unit price
        self.pointer_serviceTime = None  # added 2022/01/05: link from the winning bidder to service period (duration)
        self.pointer_ratio = None  # added 2022/01/05: link from the bidder to rate / downward-floating rate
        self.origin_entity_text = ''  # added 2022/1/5: original entity name before dictionary replacement
        self.in_attachment = in_attachment
        self.prob = prob
        self.ratio_value = None  # added 2022/10/18: rate data as (value, ratio_type)

    def set_Role(self,role_label,role_values):
        '''
        @summary: store the role label as int and the class probabilities as floats.
        '''
        self.label = int(role_label)
        self.values = [float(i) for i in role_values]

    def set_Money(self,money_label,money_values):
        '''
        @summary: store the money label as int and the class probabilities as floats.
        '''
        self.label = int(money_label)
        self.values = [float(i) for i in money_values]

    def set_Person(self,person_label,person_values,person_phone):
        '''
        @summary: store the person label as int, the class probabilities as
                  floats, and the given phone value.
        '''
        self.label = int(person_label)
        self.values = [float(i) for i in person_values]
        self.person_phone = person_phone

    def toJson(self):
        '''
        @summary: serialise the positional/label fields, ``person_phone`` and
                  ``in_attachment`` to a JSON string.
        '''
        _dict = {"doc_id":self.doc_id,"entity_id":self.entity_id,"entity_text":self.entity_text,
                 "entity_type":self.entity_type,"sentence_index":self.sentence_index,"begin_index":self.begin_index,
                 "end_index":self.end_index,"wordOffset_begin":self.wordOffset_begin,"wordOffset_end":self.wordOffset_end,
                 "label":int(self.label) if self.label is not None else None,"values":self.values,
                 "person_phone":self.person_phone,"in_attachment":self.in_attachment}
        return json.dumps(_dict)

    @staticmethod
    def fromJson(_json):
        '''
        @summary: rebuild an Entity from a string produced by ``toJson``.

        ``__init__`` ignores its ``person_phone`` argument, so the serialised
        value is assigned after construction; otherwise it would be dropped.
        JSON without "in_attachment" loads with the flag set to False.
        '''
        _dict = json.loads(_json)
        _entity = Entity(_dict.get("doc_id"),_dict.get("entity_id"),_dict.get("entity_text"),_dict.get("entity_type"),
                         _dict.get("sentence_index"),_dict.get("begin_index"),_dict.get("end_index"),_dict.get("wordOffset_begin"),
                         _dict.get("wordOffset_end"),_dict.get("label"),_dict.get("values"),_dict.get("person_phone"),
                         _dict.get("in_attachment",False))
        _phone = _dict.get("person_phone")
        if _phone is not None:
            _entity.person_phone = _phone
        return _entity
-
class PREM():
    '''
    @summary: package - lot code - role - company entity - money - money
              probability - contact - contact probability - contact phone
    '''

    def __init__(self,packageName,packageCode,role_name,entity_text,role_prob,money,money_prob,linklist):
        '''
        @param:
            packageName: package name
            packageCode: lot code
            role_name: role name
            entity_text: company entity name
            role_prob: role probability
            money: amount
            money_prob: amount probability
            linklist: contact list [contact person, contact phone]
        '''
        # Package identification.
        self.packageName, self.packageCode = packageName, packageCode
        # Role and the company filling it.
        self.role_name, self.entity_text, self.role_prob = role_name, entity_text, role_prob
        # Amount attached to the role.
        self.money, self.money_prob = money, money_prob
        self.linklist = linklist
        self.multi_winner = set()  # added 2024/4/8: multiple winning bidders

    def getString(self,roleList):
        '''
        @summary: de-duplicate ``linklist`` in place and return
                  [packageName, packageCode, role_name, entity_text passed
                  through fitDataByRule, money, linklist].

        ``roleList`` is not used; conflicts are no longer resolved here.
        '''
        unique_links = list(set(self.linklist))
        self.linklist = unique_links
        return [self.packageName, self.packageCode, self.role_name,
                fitDataByRule(self.entity_text), self.money, unique_links]
-
class Role():
    '''
    @summary: all attributes owned by one role.
    '''

    def __init__(self,role_name,entity_text,role_prob,money,money_prob,linklist, multi_winner):
        self.role_name, self.entity_text, self.role_prob = role_name, entity_text, role_prob
        self.money, self.money_prob = money, money_prob
        self.linklist = linklist
        self.money_unit = ''  # added 2021/8/17: unit of the amount
        # Attributes of winning / bidding parties.
        self.ratio = None  # added 2022/01/06: rate tied to the bid amount, as (ratio_value, ratio_type)
        self.serviceTime = ""  # added 2021/01/06: service period (duration)
        self.address = ""  # added 2022/08/08: address of the role
        self.multi_winner = multi_winner  # added 2024/4/8: multiple winning bidders

    def getString(self):
        '''
        @summary: de-duplicate ``linklist`` in place and return the role as a dict.

        The dict has role_name, role_text (entity_text passed through
        fitDataByRule), role_money, linklist, serviceTime and address.
        role_money holds money, money_unit and the three rate fields
        floating_ratio, downward_floating_ratio and discount_ratio; only the
        one named by ``self.ratio[1]`` receives ``str(self.ratio[0])``, the
        others stay "".
        "role_prob" is added for a 'tenderee'. For a 'win_tenderer' whose
        ``multi_winner`` differs from an empty set, role_text is added to
        ``multi_winner`` and the joined set is returned as "multi_winner".
        '''
        self.linklist = list(set(self.linklist))

        ratio_fields = {'floating_ratio': "", 'downward_floating_ratio': "", 'discount_ratio': ""}
        if self.ratio:
            ratio_type = self.ratio[1]
            ratio_value = str(self.ratio[0])
            for field_name in ratio_fields:
                if ratio_type == field_name:
                    ratio_fields[field_name] = ratio_value

        role_money = {'money': self.money, 'money_unit': self.money_unit}
        role_money.update(ratio_fields)

        result = {
            'role_name': self.role_name,
            'role_text': fitDataByRule(self.entity_text),
            'role_money': role_money,
            'linklist': self.linklist,
            'serviceTime': self.serviceTime,
            'address': self.address,
        }

        role = result['role_name']
        if role == 'tenderee':
            result['role_prob'] = self.role_prob
        if role == 'win_tenderer' and self.multi_winner != set():
            # This also extends the set stored on the instance.
            self.multi_winner.add(result['role_text'])
            result['multi_winner'] = ','.join(self.multi_winner)
        return result
# Used for pairing combinations in the KM algorithm.
class Match():
    def __init__(self,main_role,attribute,value=None):
        '''
        :param main_role: the main role
        :param attribute: an attribute of the role
        :param value: weight of the match
        '''
        self.main_role, self.attribute, self.value = main_role, attribute, value
if __name__=="__main__":
    # Smoke check: serialise an Article, load it back and print the result.
    original = Article(1,[0.0026275085, 9.795774e-05, 0.00066399743, 0.99661046],"2","4","5")
    restored = Article.fromJson(original.toJson())
    print(restored.toJson())
|