# Entitys.py


'''
Created on 2018-12-29
@author: User
'''
  5. from BiddingKG.dl.common.Utils import *
  6. import json
  7. from bs4 import BeautifulSoup
  8. import re
  9. class RelationsTree():
  10. '''
  11. @summary: make a attribute tree
  12. '''
  13. def __init__(self):
  14. self.tree = dict()
  15. self.nodes = dict()
  16. self.nodes["ROOT"] = self.tree
  17. def add_relation(self,relation,parent,child):
  18. if parent in self.nodes:
  19. _relation = relation+"_"+str(parent)+str(child)
  20. self.nodes[parent][_relation] = dict()
  21. self.nodes[child] = self.nodes[parent][_relation]
  22. pattern_attachment = re.compile("\.(?P<attachment>jpg|jpeg|png|swf|tif|pdf|doc|docx|xls|xlsx|zip|rar|tar|7z|wim)$")
  23. class Article():
  24. '''
  25. @summary:文章类
  26. '''
  27. def __init__(self,id,content,sourceContent,doc_id,title,code="",name="",bidway=""):
  28. '''
  29. @param:
  30. id:文章的uuid
  31. content:文章经过预处理之后的文本
  32. '''
  33. self.id = id
  34. self.content = content
  35. self.code = code
  36. self.name = name
  37. self.sourceContent = sourceContent
  38. self.doc_id = doc_id
  39. self.title = title
  40. self.fingerprint = ""
  41. self.match_enterprise = []
  42. self.match_enterprise_type = 0
  43. self.attachmentTypes = self.getAttachmentTypes(sourceContent)
  44. self.bidway = bidway
  45. def toJson(self):
  46. _dict = {"id":self.id,"content":self.content,"code":self.code,
  47. "name":self.name,"sourceContent":"self.sourceContent","doc_id":self.doc_id,"title":self.title}
  48. return json.dumps(_dict)
  49. @staticmethod
  50. def fromJson(_json):
  51. _dict = json.loads(_json)
  52. return Article(_dict.get("id"),_dict.get("content"),
  53. _dict.get("sourceContent"),_dict.get("doc_id"),_dict.get("title"),_dict.get("code"),_dict.get("name"))
  54. @staticmethod
  55. def getAttachmentTypeFromUrl(url):
  56. _match = re.search(pattern_attachment,url)
  57. if _match is not None:
  58. return _match.groupdict().get("attachment")
  59. return None
  60. @staticmethod
  61. def getAttachmentTypes(sourceHtml):
  62. _soup = BeautifulSoup(sourceHtml,"lxml")
  63. set_types = set()
  64. list_a = _soup.find_all("a")
  65. for _a in list_a:
  66. _url = _a.attrs.get("href","")
  67. _type = Article.getAttachmentTypeFromUrl(_url)
  68. if _type is not None:
  69. set_types.add(_type)
  70. list_img = _soup.find_all("img")
  71. for _img in list_img:
  72. _url = _img.attrs.get("src","")
  73. _type = Article.getAttachmentTypeFromUrl(_url)
  74. if _type is not None:
  75. set_types.add(_type)
  76. return ",".join(list(set_types))
  77. class Sentences():
  78. '''
  79. @summary:句子类
  80. '''
  81. def __init__(self,doc_id,sentence_index,sentence_text,tokens,pos_tags,ner_tags,in_attachment=False):
  82. '''
  83. @param:
  84. doc_id:文章的uuid
  85. sentence_index:文章的句子编号
  86. sentence_text:句子内容
  87. tokens:句子分词
  88. pos_tags:词性标注(算法目前没有用到,暂为空)
  89. ner_tags:实体识别
  90. '''
  91. self.doc_id = doc_id
  92. self.sentence_index = sentence_index
  93. self.sentence_text = sentence_text
  94. self.tokens = tokens
  95. self.pos_tags = pos_tags
  96. self.ner_tags = ner_tags
  97. self.in_attachment = in_attachment # 2022/02/10添加,句子是否在附件中
  98. def toJson(self):
  99. _dict = {"doc_id":self.doc_id,"sentence_index":self.sentence_index,"sentence_text":self.sentence_text,
  100. "tokens":self.tokens,"pos_tags":self.pos_tags,"ner_tags":self.ner_tags}
  101. return json.dumps(_dict)
  102. @staticmethod
  103. def fromJson(_json):
  104. _dict = json.loads(_json)
  105. return Sentences(_dict.get("doc_id"),_dict.get("sentence_index"),_dict.get("sentence_text"),_dict.get("tokens"),
  106. _dict.get("pos_tags"),_dict.get("ner_tags"))
  107. class Outline():
  108. '''
  109. @summary:根据正则划分公告的大纲
  110. '''
  111. def __init__(self, doc_id,outline_index,outline_text, sentence_begin_index, sentence_end_index,wordOffset_begin,wordOffset_end):
  112. '''
  113. @param:
  114. doc_id:文章的uuid
  115. sentence_begin_index:开始的句子索引
  116. sentence_end_index:结束的句子索引
  117. wordOffset_begin:开始句子的起始字索引
  118. wordOffset_end:结束句子的结尾字索引
  119. outline_text:大纲的全文本
  120. outline_summary:大纲的概要
  121. '''
  122. self.doc_id = doc_id
  123. self.outline_index = outline_index
  124. self.outline_text = outline_text
  125. self.sentence_begin_index = sentence_begin_index
  126. self.sentence_end_index = sentence_end_index
  127. self.wordOffset_begin = wordOffset_begin
  128. self.wordOffset_end = wordOffset_end
  129. self.outline_summary = ""
  130. class Entity():
  131. '''
  132. @summary:实体类
  133. '''
  134. def __init__(self,doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,wordOffset_begin=None,wordOffset_end=None,label=None,values=None,person_phone=None,in_attachment=False, prob=0):
  135. '''
  136. @param:
  137. doc_id:文章的uuid
  138. entity_id:相同实体类型的实体的唯一值,由于抽取算法不唯一,不同类型的实体可能拥有相同entity_id
  139. entity_text:实体的内容
  140. entity_type:实体类型
  141. sentence_index:句子下标
  142. begin_index:实体所在句子的开始位置
  143. end_index:实体所在句子的结束位置
  144. label:实体所属类别
  145. value:实体的各类别概率值
  146. '''
  147. self.doc_id = doc_id
  148. self.entity_id = entity_id
  149. self.entity_text = entity_text
  150. self.entity_type = entity_type
  151. self.sentence_index = sentence_index
  152. self.begin_index = begin_index
  153. self.end_index = end_index
  154. self.wordOffset_begin = wordOffset_begin
  155. self.wordOffset_end = wordOffset_end
  156. self.label = label
  157. self.values = values
  158. self.handlabel = True
  159. self.packageName = "Project"
  160. self.packageCode = ""
  161. self.roleName = ""
  162. self.linked_entitys = []
  163. self.pointer_pack = None
  164. self.pointer_money = None
  165. # self.pointer_person = None
  166. self.pointer_person = []
  167. self.pointer_address = None
  168. self.pointer_tendereeMoney = None
  169. # self.person_phone = person_phone
  170. self.person_phone = []
  171. self.pointer_email = None
  172. self.is_tail = False
  173. self.notes = '' # 2021/7/20 新增,保存金额大小写,单位等备注
  174. self.money_unit = '' # 2021/8/17 新增,保存金额单位 元、万元 、亿元
  175. self.if_dict_match = 0 # 2021/12/21 新增,判断公司实体是否由字典识别得到
  176. self.is_total_money = 0 # 2021/12/29 新增,判断金额是否总价
  177. self.is_unit_money = 0 # 2021/12/29 新增,判断金额是否单价
  178. self.pointer_serviceTime = None # 2022/01/05 新增,中标人对应链接"服务期限(工期)"
  179. self.pointer_ratio = None # 2022/01/05 新增,中标人对应链接"中投标金额->费率、下浮率"
  180. self.origin_entity_text = '' # 2022/1/5 新增,记录字典替换的原来的实体名
  181. self.in_attachment = in_attachment # 2022/02/10添加,实体是否在附件中
  182. self.prob = prob # 2022/06/20添加,实体的概率
  183. self.ratio_value = None # 2022/10/18 新增费率处理数据,(value,ratio_type) 费率数值,类型
  184. def set_Role(self,role_label,role_values):
  185. self.label = int(role_label)
  186. self.values = [float(i) for i in role_values]
  187. def set_Money(self,money_label,money_values):
  188. self.label = int(money_label)
  189. self.values = [float(i) for i in money_values]
  190. def set_Person(self,person_label,person_values,person_phone):
  191. self.label = int(person_label)
  192. self.values = [float(i) for i in person_values]
  193. self.person_phone = person_phone
  194. def toJson(self):
  195. _dict = {"doc_id":self.doc_id,"entity_id":self.entity_id,"entity_text":self.entity_text,
  196. "entity_type":self.entity_type,"sentence_index":self.sentence_index,"begin_index":self.begin_index,
  197. "end_index":self.end_index,"wordOffset_begin":self.wordOffset_begin,"wordOffset_end":self.wordOffset_end,
  198. "label":int(self.label) if self.label is not None else None,"values":self.values,"person_phone":self.person_phone}
  199. return json.dumps(_dict)
  200. @staticmethod
  201. def fromJson(_json):
  202. _dict = json.loads(_json)
  203. return Entity(_dict.get("doc_id"),_dict.get("entity_id"),_dict.get("entity_text"),_dict.get("entity_type"),
  204. _dict.get("sentence_index"),_dict.get("begin_index"),_dict.get("end_index"),_dict.get("wordOffset_begin"),
  205. _dict.get("wordOffset_end"),_dict.get("label"),_dict.get("values"),_dict.get("person_phone"))
  206. class PREM():
  207. '''
  208. @summary:包-标段号-角色-公司实体-金额-金额概率-联系人-联系人概率-联系电话
  209. '''
  210. def __init__(self,packageName,packageCode,role_name,entity_text,role_prob,money,money_prob,linklist):
  211. '''
  212. @param:
  213. packageName:包名
  214. packageCode:标段号
  215. role_name:角色名称
  216. entity_text:公司实体名称
  217. role_prob:角色概率
  218. money:金额
  219. money_prob:金额概率
  220. linklist:联系list[联系人,联系电话]
  221. '''
  222. self.packageName = packageName
  223. self.packageCode = packageCode
  224. self.role_name = role_name
  225. self.entity_text = entity_text
  226. self.role_prob = role_prob
  227. self.money = money
  228. self.money_prob = money_prob
  229. self.linklist = linklist
  230. self.multi_winner = set() # 2024/4/8 #添加多中标人
  231. def getString(self,roleList):
  232. '''
  233. #不再在这里解决冲突
  234. count = 0
  235. for item in roleList:
  236. if item.entity_text==self.entity_text:
  237. if item.packageName==self.packageName:
  238. count += 1
  239. else:
  240. if "Project" in [item.packageName,self.packageName]:
  241. count += 1
  242. if count==1:
  243. self.linklist = [item for item in set(self.linklist)]
  244. result = [self.packageName,self.packageCode,self.role_name,self.entity_text,self.money,self.linklist]
  245. else:
  246. result = None
  247. '''
  248. self.linklist = [item for item in set(self.linklist)]
  249. result = [self.packageName,self.packageCode,self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist]
  250. return result
  251. class Role():
  252. '''
  253. @summary: 定义一个角色拥有的所有属性
  254. '''
  255. def __init__(self,role_name,entity_text,role_prob,money,money_prob,linklist, multi_winner):
  256. self.role_name = role_name
  257. self.entity_text = entity_text
  258. self.role_prob = role_prob
  259. self.money = money
  260. self.money_prob = money_prob
  261. self.linklist = linklist
  262. self.money_unit = '' # 2021/8/17 新增 保存金额单位
  263. # 中投标人属性
  264. self.ratio = None #2022/01/06 新增 保存中投标金额相关费率 (ratio_value,ratio_type)
  265. self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
  266. self.address = "" #2022/08/08 新增 角色地址
  267. self.multi_winner = multi_winner #2024/4/8 新增多中标人
  268. def getString(self):
  269. self.linklist = [item for item in set(self.linklist)]
  270. # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist]
  271. # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit]
  272. # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit,self.ratio,self.serviceTime]
  273. floating_ratio = "" # 上浮率
  274. downward_floating_ratio = "" # 下浮率
  275. discount_ratio = "" # 折扣率/费率
  276. if self.ratio:
  277. # num_value = re.search("\d+(?:\.\d+)?",self.ratio).group()
  278. # num_value = float(num_value)
  279. # _decimal = str(num_value).split('.')[1]
  280. # if _decimal=='0':
  281. # round_len = 0
  282. # else:
  283. # round_len = len(_decimal)
  284. # if re.search("%|百分之",self.ratio):
  285. # num_value = num_value * 0.01
  286. # round_len += 2
  287. # elif re.search("‰|千分之",self.ratio):
  288. # num_value = num_value * 0.001
  289. # round_len += 3
  290. # num_value = str(round(num_value,round_len))
  291. #
  292. # if re.search("上浮",self.ratio):
  293. # floating_ratio = num_value
  294. # elif re.search("下浮",self.ratio):
  295. # downward_floating_ratio = num_value
  296. # else:
  297. # discount_ratio = num_value
  298. ratio_type = self.ratio[1]
  299. ratio_value = str(self.ratio[0])
  300. if ratio_type=='floating_ratio':
  301. floating_ratio = ratio_value
  302. elif ratio_type=='downward_floating_ratio':
  303. downward_floating_ratio = ratio_value
  304. elif ratio_type=='discount_ratio':
  305. discount_ratio = ratio_value
  306. result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
  307. 'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
  308. 'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
  309. if result['role_name'] in ['tenderee', 'win_tenderer']:
  310. result['role_prob'] = self.role_prob
  311. if result['role_name'] == 'win_tenderer' and self.multi_winner != set():
  312. self.multi_winner.add(result['role_text'])
  313. result['multi_winner'] = ','.join(self.multi_winner)
  314. return result
  315. # 用于KM算法的组合配对
  316. class Match():
  317. def __init__(self,main_role,attribute,value=None):
  318. '''
  319. :param main_role: 主要的角色
  320. :param attribute: 角色的属性
  321. :param value: 相匹配的权值
  322. '''
  323. self.main_role = main_role
  324. self.attribute = attribute
  325. self.value = value
  326. if __name__=="__main__":
  327. a = Article(1,[0.0026275085, 9.795774e-05, 0.00066399743, 0.99661046],"2","4","5")
  328. b = Article.fromJson(a.toJson())
  329. print(b.toJson())