'''
Created on 2018-12-29

@author: User
'''

from BiddingKG.dl.common.Utils import *
import json
from bs4 import BeautifulSoup
import re


class RelationsTree():
    '''
    @summary: build an attribute tree out of nested dicts
    '''

    def __init__(self):
        # `tree` is the nested-dict tree itself; `nodes` maps a node key to
        # the sub-dict that represents that node inside `tree`.
        self.tree = dict()
        self.nodes = dict()
        self.nodes["ROOT"] = self.tree

    def add_relation(self, relation, parent, child):
        '''
        @summary: attach `child` under `parent`
        @param:
            relation: relation name, used as the prefix of the edge key
            parent: key of an already registered node ("ROOT" is always present)
            child: key under which the new node is registered
        Nothing happens when `parent` is not a known node.
        '''
        if parent in self.nodes:
            _relation = relation + "_" + str(parent) + str(child)
            self.nodes[parent][_relation] = dict()
            # Register the child so it can later be used as a parent itself.
            self.nodes[child] = self.nodes[parent][_relation]


# Matches a URL ending in a known attachment extension. The named group
# "attachment" is what Article.getAttachmentTypeFromUrl reads back.
pattern_attachment = re.compile(r"\.(?P<attachment>jpg|jpeg|png|swf|tif|pdf|doc|docx|xls|xlsx|zip|rar|tar|7z|wim)$")


class Article():
    '''
    @summary: article class
    '''

    def __init__(self, id, content, sourceContent, doc_id, title, code="", name="", bidway=""):
        '''
        @param:
            id: uuid of the article
            content: text of the article after preprocessing
            sourceContent: source HTML; scanned for attachment links
            doc_id, title, code, name, bidway: stored as given
        '''
        self.id = id
        self.content = content
        self.code = code
        self.name = name
        self.sourceContent = sourceContent
        self.doc_id = doc_id
        self.title = title
        self.fingerprint = ""
        self.match_enterprise = []
        self.match_enterprise_type = 0
        self.attachmentTypes = self.getAttachmentTypes(sourceContent)
        self.bidway = bidway

    def toJson(self):
        '''
        @summary: serialize the constructor fields (except bidway) to a JSON string
        '''
        _dict = {"id": self.id, "content": self.content, "code": self.code,
                 "name": self.name, "sourceContent": self.sourceContent,
                 "doc_id": self.doc_id, "title": self.title}
        return json.dumps(_dict)

    @staticmethod
    def fromJson(_json):
        '''
        @summary: rebuild an Article from a string produced by toJson
        '''
        _dict = json.loads(_json)
        # Fall back to the constructor defaults when code/name are absent.
        return Article(_dict.get("id"), _dict.get("content"), _dict.get("sourceContent"),
                       _dict.get("doc_id"), _dict.get("title"),
                       _dict.get("code", ""), _dict.get("name", ""))

    @staticmethod
    def getAttachmentTypeFromUrl(url):
        '''
        @summary: return the attachment extension the url ends with, or None
        '''
        if not url:
            return None
        _match = pattern_attachment.search(url)
        if _match is not None:
            return _match.group("attachment")
        return None

    @staticmethod
    def getAttachmentTypes(sourceHtml):
        '''
        @summary: collect the attachment extensions referenced by <a href> and
                  <img src> in the html
        @return: comma-joined extensions without duplicates, sorted so the
                 result is deterministic; "" when nothing is found or the
                 html is empty/None
        '''
        if not sourceHtml:
            return ""
        _soup = BeautifulSoup(sourceHtml, "lxml")
        set_types = set()
        for _tag_name, _attr_name in (("a", "href"), ("img", "src")):
            for _tag in _soup.find_all(_tag_name):
                _type = Article.getAttachmentTypeFromUrl(_tag.attrs.get(_attr_name, ""))
                if _type is not None:
                    set_types.add(_type)
        return ",".join(sorted(set_types))


class Sentences():
    '''
    @summary: sentence class
    '''

    def __init__(self, doc_id, sentence_index, sentence_text, tokens, pos_tags, ner_tags, in_attachment=False):
        '''
        @param:
            doc_id: uuid of the article
            sentence_index: index of the sentence within the article
            sentence_text: text of the sentence
            tokens: word segmentation of the sentence
            pos_tags: part-of-speech tags (not used by the algorithm yet, empty for now)
            ner_tags: named entity recognition tags
            in_attachment: whether the sentence is in an attachment
        '''
        self.doc_id = doc_id
        self.sentence_index = sentence_index
        self.sentence_text = sentence_text
        self.tokens = tokens
        self.pos_tags = pos_tags
        self.ner_tags = ner_tags
        self.in_attachment = in_attachment  # added 2022/02/10: whether the sentence is in an attachment

    def toJson(self):
        '''
        @summary: serialize to a JSON string (in_attachment is not included)
        '''
        _dict = {"doc_id": self.doc_id, "sentence_index": self.sentence_index,
                 "sentence_text": self.sentence_text, "tokens": self.tokens,
                 "pos_tags": self.pos_tags, "ner_tags": self.ner_tags}
        return json.dumps(_dict)

    @staticmethod
    def fromJson(_json):
        '''
        @summary: rebuild a Sentences object from a string produced by toJson
        '''
        _dict = json.loads(_json)
        return Sentences(_dict.get("doc_id"), _dict.get("sentence_index"), _dict.get("sentence_text"),
                         _dict.get("tokens"), _dict.get("pos_tags"), _dict.get("ner_tags"))


class Outline():
    '''
    @summary: outline of an announcement, split by regular expressions
    '''

    def __init__(self, doc_id, outline_index, outline_text, sentence_begin_index, sentence_end_index,
                 wordOffset_begin, wordOffset_end):
        '''
        @param:
            doc_id: uuid of the article
            sentence_begin_index: index of the first sentence
            sentence_end_index: index of the last sentence
            wordOffset_begin: start character offset inside the first sentence
            wordOffset_end: end character offset inside the last sentence
            outline_text: full text of the outline
        The attribute outline_summary (summary of the outline) starts empty.
        '''
        self.doc_id = doc_id
        self.outline_index = outline_index
        self.outline_text = outline_text
        self.sentence_begin_index = sentence_begin_index
        self.sentence_end_index = sentence_end_index
        self.wordOffset_begin = wordOffset_begin
        self.wordOffset_end = wordOffset_end
        self.outline_summary = ""


class Entity():
    '''
    @summary: entity class
    '''

    def __init__(self, doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
                 wordOffset_begin=None, wordOffset_end=None, label=None, values=None, person_phone=None,
                 in_attachment=False, prob=0):
        '''
        @param:
            doc_id: uuid of the article
            entity_id: unique among entities of the same type; because several
                       extraction algorithms are used, entities of different
                       types may share the same entity_id
            entity_text: text of the entity
            entity_type: type of the entity
            sentence_index: index of the sentence
            begin_index: start position of the entity inside its sentence
            end_index: end position of the entity inside its sentence
            label: category the entity belongs to
            values: per-category probabilities of the entity
            person_phone: accepted for compatibility but currently ignored;
                          the attribute always starts as an empty list
            in_attachment: whether the entity is in an attachment
            prob: probability of the entity
        '''
        self.doc_id = doc_id
        self.entity_id = entity_id
        self.entity_text = entity_text
        self.entity_type = entity_type
        self.sentence_index = sentence_index
        self.begin_index = begin_index
        self.end_index = end_index
        self.wordOffset_begin = wordOffset_begin
        self.wordOffset_end = wordOffset_end
        self.label = label
        self.values = values
        self.handlabel = True
        self.packageName = "Project"
        self.packageCode = ""
        self.roleName = ""
        self.linked_entitys = []
        self.pointer_pack = None
        self.pointer_money = None
        self.pointer_person = []
        self.pointer_address = None
        self.pointer_tendereeMoney = None
        self.person_phone = []
        self.pointer_email = None
        self.is_tail = False
        self.notes = ''  # added 2021/7/20: remarks on an amount, e.g. upper/lower-case form, unit
        self.money_unit = ''  # added 2021/8/17: unit of the amount (yuan, ten thousand yuan, hundred million yuan)
        self.if_dict_match = 0  # added 2021/12/21: whether the company entity was recognized via the dictionary
        self.is_total_money = 0  # added 2021/12/29: whether the amount is a total price
        self.is_unit_money = 0  # added 2021/12/29: whether the amount is a unit price
        self.pointer_serviceTime = None  # added 2022/01/05: winning bidder's link to "service period (construction period)"
        self.pointer_ratio = None  # added 2022/01/05: winning bidder's link to "bid amount -> rate, downward floating rate"
        self.origin_entity_text = ''  # added 2022/1/5: original entity name before dictionary replacement
        self.in_attachment = in_attachment  # added 2022/02/10: whether the entity is in an attachment
        self.prob = prob  # added 2022/06/20: probability of the entity
        self.ratio_value = None  # added 2022/10/18: processed rate data, (value, ratio_type)

    def set_Role(self, role_label, role_values):
        '''
        @summary: store the role label (as int) and its probabilities (as floats)
        '''
        self.label = int(role_label)
        self.values = [float(i) for i in role_values]

    def set_Money(self, money_label, money_values):
        '''
        @summary: store the money label (as int) and its probabilities (as floats)
        '''
        self.label = int(money_label)
        self.values = [float(i) for i in money_values]

    def set_Person(self, person_label, person_values, person_phone):
        '''
        @summary: store the person label (as int), its probabilities (as floats)
                  and the phone value
        '''
        self.label = int(person_label)
        self.values = [float(i) for i in person_values]
        self.person_phone = person_phone

    def toJson(self):
        '''
        @summary: serialize the core fields to a JSON string; attributes that
                  are not constructor fields, plus in_attachment and prob,
                  are not included
        '''
        _dict = {"doc_id": self.doc_id, "entity_id": self.entity_id, "entity_text": self.entity_text,
                 "entity_type": self.entity_type, "sentence_index": self.sentence_index,
                 "begin_index": self.begin_index, "end_index": self.end_index,
                 "wordOffset_begin": self.wordOffset_begin, "wordOffset_end": self.wordOffset_end,
                 "label": int(self.label) if self.label is not None else None,
                 "values": self.values, "person_phone": self.person_phone}
        return json.dumps(_dict)

    @staticmethod
    def fromJson(_json):
        '''
        @summary: rebuild an Entity from a string produced by toJson
        '''
        _dict = json.loads(_json)
        return Entity(_dict.get("doc_id"), _dict.get("entity_id"), _dict.get("entity_text"),
                      _dict.get("entity_type"), _dict.get("sentence_index"), _dict.get("begin_index"),
                      _dict.get("end_index"), _dict.get("wordOffset_begin"), _dict.get("wordOffset_end"),
                      _dict.get("label"), _dict.get("values"), _dict.get("person_phone"))


class PREM():
    '''
    @summary: package - section code - role - company entity - amount -
              amount probability - contact person - contact probability - contact phone
    '''

    def __init__(self, packageName, packageCode, role_name, entity_text, role_prob, money, money_prob, linklist):
        '''
        @param:
            packageName: package name
            packageCode: section code
            role_name: role name
            entity_text: company entity name
            role_prob: role probability
            money: amount
            money_prob: amount probability
            linklist: contact list [contact person, contact phone]
        '''
        self.packageName = packageName
        self.packageCode = packageCode
        self.role_name = role_name
        self.entity_text = entity_text
        self.role_prob = role_prob
        self.money = money
        self.money_prob = money_prob
        self.linklist = linklist
        self.multi_winner = set()  # added 2024/4/8: multiple winning bidders

    def getString(self, roleList):
        '''
        @summary: return [packageName, packageCode, role_name, entity_text, money, linklist]
        @param:
            roleList: unused; conflicts between roles are no longer resolved here
        linklist is de-duplicated in place (its order is not preserved).
        '''
        self.linklist = list(set(self.linklist))
        # fitDataByRule is not defined in this file; presumably it comes from
        # the wildcard import of BiddingKG.dl.common.Utils.
        result = [self.packageName, self.packageCode, self.role_name,
                  fitDataByRule(self.entity_text), self.money, self.linklist]
        return result


class Role():
    '''
    @summary: defines all attributes a role has
    '''

    def __init__(self, role_name, entity_text, role_prob, money, money_prob, linklist, multi_winner):
        self.role_name = role_name
        self.entity_text = entity_text
        self.role_prob = role_prob
        self.money = money
        self.money_prob = money_prob
        self.linklist = linklist
        self.money_unit = ''  # added 2021/8/17: unit of the amount
        # attributes of winning / bidding parties
        self.ratio = None  # added 2022/01/06: rate related to the bid amount, (ratio_value, ratio_type)
        self.serviceTime = ""  # added 2021/01/06: service period (construction period)
        self.address = ""  # added 2022/08/08: address of the role
        self.multi_winner = multi_winner  # added 2024/4/8: multiple winning bidders

    def getString(self):
        '''
        @summary: build the dict describing this role
        @return: dict with role_name, role_text, role_money, linklist,
                 serviceTime and address; role_prob is added for 'tenderee';
                 multi_winner (comma-joined, sorted) is added for
                 'win_tenderer' when multi_winner is not an empty set
        linklist is de-duplicated in place (its order is not preserved), and
        the role's own text is added to multi_winner when it is emitted.
        '''
        self.linklist = list(set(self.linklist))

        floating_ratio = ""  # upward floating rate
        downward_floating_ratio = ""  # downward floating rate
        discount_ratio = ""  # discount rate / fee rate
        if self.ratio:
            # self.ratio is a (ratio_value, ratio_type) pair.
            ratio_type = self.ratio[1]
            ratio_value = str(self.ratio[0])
            if ratio_type == 'floating_ratio':
                floating_ratio = ratio_value
            elif ratio_type == 'downward_floating_ratio':
                downward_floating_ratio = ratio_value
            elif ratio_type == 'discount_ratio':
                discount_ratio = ratio_value

        result = {'role_name': self.role_name, 'role_text': fitDataByRule(self.entity_text),
                  'role_money': {'money': self.money, 'money_unit': self.money_unit,
                                 'floating_ratio': floating_ratio,
                                 'downward_floating_ratio': downward_floating_ratio,
                                 'discount_ratio': discount_ratio},
                  'linklist': self.linklist, 'serviceTime': self.serviceTime, 'address': self.address}
        if result['role_name'] == 'tenderee':
            result['role_prob'] = self.role_prob
        if result['role_name'] == 'win_tenderer' and self.multi_winner != set():
            self.multi_winner.add(result['role_text'])
            # Sorted so the joined string does not depend on set iteration order.
            result['multi_winner'] = ','.join(sorted(self.multi_winner))
        return result


# Pairing used by the KM algorithm
class Match():

    def __init__(self, main_role, attribute, value=None):
        '''
        :param main_role: the main role
        :param attribute: an attribute of the role
        :param value: weight of the match
        '''
        self.main_role = main_role
        self.attribute = attribute
        self.value = value


if __name__ == "__main__":
    # Round-trip demo: serialize an Article, rebuild it, and print it again.
    a = Article(1, [0.0026275085, 9.795774e-05, 0.00066399743, 0.99661046], "2", "4", "5")
    b = Article.fromJson(a.toJson())
    print(b.toJson())