# Entitys.py
'''
Created on 2018-12-29
@author: User
'''
  5. from BiddingKG.dl.common.Utils import *
  6. import json
  7. from bs4 import BeautifulSoup
  8. import re
  9. class RelationsTree():
  10. '''
  11. @summary: make a attribute tree
  12. '''
  13. def __init__(self):
  14. self.tree = dict()
  15. self.nodes = dict()
  16. self.nodes["ROOT"] = self.tree
  17. def add_relation(self,relation,parent,child):
  18. if parent in self.nodes:
  19. _relation = relation+"_"+str(parent)+str(child)
  20. self.nodes[parent][_relation] = dict()
  21. self.nodes[child] = self.nodes[parent][_relation]
  22. pattern_attachment = re.compile("\.(?P<attachment>jpg|jpeg|png|swf|tif|pdf|doc|docx|xls|xlsx|zip|rar|tar|7z|wim)$")
  23. class Article():
  24. '''
  25. @summary:文章类
  26. '''
  27. def __init__(self,id,content,sourceContent,doc_id,title,code="",name=""):
  28. '''
  29. @param:
  30. id:文章的uuid
  31. content:文章经过预处理之后的文本
  32. '''
  33. self.id = id
  34. self.content = content
  35. self.code = code
  36. self.name = name
  37. self.sourceContent = sourceContent
  38. self.doc_id = doc_id
  39. self.title = title
  40. self.fingerprint = ""
  41. self.match_enterprise = []
  42. self.match_enterprise_type = 0
  43. self.attachmentTypes = self.getAttachmentTypes(sourceContent)
  44. def toJson(self):
  45. _dict = {"id":self.id,"content":self.content,"code":self.code,
  46. "name":self.name,"sourceContent":"self.sourceContent","doc_id":self.doc_id,"title":self.title}
  47. return json.dumps(_dict)
  48. @staticmethod
  49. def fromJson(_json):
  50. _dict = json.loads(_json)
  51. return Article(_dict.get("id"),_dict.get("content"),
  52. _dict.get("sourceContent"),_dict.get("doc_id"),_dict.get("title"),_dict.get("code"),_dict.get("name"))
  53. @staticmethod
  54. def getAttachmentTypeFromUrl(url):
  55. _match = re.search(pattern_attachment,url)
  56. if _match is not None:
  57. return _match.groupdict().get("attachment")
  58. return None
  59. @staticmethod
  60. def getAttachmentTypes(sourceHtml):
  61. _soup = BeautifulSoup(sourceHtml,"lxml")
  62. set_types = set()
  63. list_a = _soup.find_all("a")
  64. for _a in list_a:
  65. _url = _a.attrs.get("href","")
  66. _type = Article.getAttachmentTypeFromUrl(_url)
  67. if _type is not None:
  68. set_types.add(_type)
  69. list_img = _soup.find_all("img")
  70. for _img in list_img:
  71. _url = _img.attrs.get("src","")
  72. _type = Article.getAttachmentTypeFromUrl(_url)
  73. if _type is not None:
  74. set_types.add(_type)
  75. return ",".join(list(set_types))
  76. class Sentences():
  77. '''
  78. @summary:句子类
  79. '''
  80. def __init__(self,doc_id,sentence_index,sentence_text,tokens,pos_tags,ner_tags):
  81. '''
  82. @param:
  83. doc_id:文章的uuid
  84. sentence_index:文章的句子编号
  85. sentence_text:句子内容
  86. tokens:句子分词
  87. pos_tags:词性标注(算法目前没有用到,暂为空)
  88. ner_tags:实体识别
  89. '''
  90. self.doc_id = doc_id
  91. self.sentence_index = sentence_index
  92. self.sentence_text = sentence_text
  93. self.tokens = tokens
  94. self.pos_tags = pos_tags
  95. self.ner_tags = ner_tags
  96. def toJson(self):
  97. _dict = {"doc_id":self.doc_id,"sentence_index":self.sentence_index,"sentence_text":self.sentence_text,
  98. "tokens":self.tokens,"pos_tags":self.pos_tags,"ner_tags":self.ner_tags}
  99. return json.dumps(_dict)
  100. @staticmethod
  101. def fromJson(_json):
  102. _dict = json.loads(_json)
  103. return Sentences(_dict.get("doc_id"),_dict.get("sentence_index"),_dict.get("sentence_text"),_dict.get("tokens"),
  104. _dict.get("pos_tags"),_dict.get("ner_tags"))
  105. class Entity():
  106. '''
  107. @summary:实体类
  108. '''
  109. def __init__(self,doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,wordOffset_begin=None,wordOffset_end=None,label=None,values=None,person_phone=None):
  110. '''
  111. @param:
  112. doc_id:文章的uuid
  113. entity_id:相同实体类型的实体的唯一值,由于抽取算法不唯一,不同类型的实体可能拥有相同entity_id
  114. entity_text:实体的内容
  115. entity_type:实体类型
  116. sentence_index:句子下标
  117. begin_index:实体所在句子的开始位置
  118. end_index:实体所在句子的结束位置
  119. label:实体所属类别
  120. value:实体的各类别概率值
  121. '''
  122. self.doc_id = doc_id
  123. self.entity_id = entity_id
  124. self.entity_text = entity_text
  125. self.entity_type = entity_type
  126. self.sentence_index = sentence_index
  127. self.begin_index = begin_index
  128. self.end_index = end_index
  129. self.wordOffset_begin = wordOffset_begin
  130. self.wordOffset_end = wordOffset_end
  131. self.label = label
  132. self.values = values
  133. self.handlabel = True
  134. self.packageName = "Project"
  135. self.packageCode = ""
  136. self.roleName = ""
  137. self.linked_entitys = []
  138. self.pointer_pack = None
  139. self.pointer_money = None
  140. # self.pointer_person = None
  141. self.pointer_person = []
  142. self.pointer_address = None
  143. self.pointer_tendereeMoney = None
  144. # self.person_phone = person_phone
  145. self.person_phone = []
  146. self.is_tail = False
  147. self.person_phone = person_phone
  148. self.notes = '' # 2021/7/20 新增,保存金额大小写,单位等备注
  149. self.money_unit = '' #2021/8/17 新增,保存金额单位 元、万元 、亿元
  150. def set_Role(self,role_label,role_values):
  151. self.label = int(role_label)
  152. self.values = [float(i) for i in role_values]
  153. def set_Money(self,money_label,money_values):
  154. self.label = int(money_label)
  155. self.values = [float(i) for i in money_values]
  156. def set_Person(self,person_label,person_values,person_phone):
  157. self.label = int(person_label)
  158. self.values = [float(i) for i in person_values]
  159. self.person_phone = person_phone
  160. def toJson(self):
  161. _dict = {"doc_id":self.doc_id,"entity_id":self.entity_id,"entity_text":self.entity_text,
  162. "entity_type":self.entity_type,"sentence_index":self.sentence_index,"begin_index":self.begin_index,
  163. "end_index":self.end_index,"wordOffset_begin":self.wordOffset_begin,"wordOffset_end":self.wordOffset_end,
  164. "label":int(self.label) if self.label is not None else None,"values":self.values,"person_phone":self.person_phone}
  165. return json.dumps(_dict)
  166. @staticmethod
  167. def fromJson(_json):
  168. _dict = json.loads(_json)
  169. return Entity(_dict.get("doc_id"),_dict.get("entity_id"),_dict.get("entity_text"),_dict.get("entity_type"),
  170. _dict.get("sentence_index"),_dict.get("begin_index"),_dict.get("end_index"),_dict.get("wordOffset_begin"),
  171. _dict.get("wordOffset_end"),_dict.get("label"),_dict.get("values"),_dict.get("person_phone"))
  172. class PREM():
  173. '''
  174. @summary:包-标段号-角色-公司实体-金额-金额概率-联系人-联系人概率-联系电话
  175. '''
  176. def __init__(self,packageName,packageCode,role_name,entity_text,role_prob,money,money_prob,linklist):
  177. '''
  178. @param:
  179. packageName:包名
  180. packageCode:标段号
  181. role_name:角色名称
  182. entity_text:公司实体名称
  183. role_prob:角色概率
  184. money:金额
  185. money_prob:金额概率
  186. linklist:联系list[联系人,联系电话]
  187. '''
  188. self.packageName = packageName
  189. self.packageCode = packageCode
  190. self.role_name = role_name
  191. self.entity_text = entity_text
  192. self.role_prob = role_prob
  193. self.money = money
  194. self.money_prob = money_prob
  195. self.linklist = linklist
  196. def getString(self,roleList):
  197. '''
  198. #不再在这里解决冲突
  199. count = 0
  200. for item in roleList:
  201. if item.entity_text==self.entity_text:
  202. if item.packageName==self.packageName:
  203. count += 1
  204. else:
  205. if "Project" in [item.packageName,self.packageName]:
  206. count += 1
  207. if count==1:
  208. self.linklist = [item for item in set(self.linklist)]
  209. result = [self.packageName,self.packageCode,self.role_name,self.entity_text,self.money,self.linklist]
  210. else:
  211. result = None
  212. '''
  213. self.linklist = [item for item in set(self.linklist)]
  214. result = [self.packageName,self.packageCode,self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist]
  215. return result
  216. class Role():
  217. '''
  218. @summary: 定义一个角色拥有的所有属性
  219. '''
  220. def __init__(self,role_name,entity_text,role_prob,money,money_prob,linklist):
  221. self.role_name = role_name
  222. self.entity_text = entity_text
  223. self.role_prob = role_prob
  224. self.money = money
  225. self.money_prob = money_prob
  226. self.linklist = linklist
  227. self.money_unit = '' # 2021/8/17 新增 保存金额单位
  228. def getString(self):
  229. self.linklist = [item for item in set(self.linklist)]
  230. # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist]
  231. result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit]
  232. return result
  233. # 用于KM算法的组合配对
  234. class Match():
  235. def __init__(self,main_role,attribute,value=None):
  236. '''
  237. :param main_role: 主要的角色
  238. :param attribute: 角色的属性
  239. :param value: 相匹配的权值
  240. '''
  241. self.main_role = main_role
  242. self.attribute = attribute
  243. self.value = value
  244. if __name__=="__main__":
  245. a = Article(1,[0.0026275085, 9.795774e-05, 0.00066399743, 0.99661046],"2","4","5")
  246. b = Article.fromJson(a.toJson())
  247. print(b.toJson())