map_entity_mention.py

#!/usr/bin/env python
#encoding:utf-8
from deepdive import *
import re
from commonutil import *


@tsv_extractor
@returns(lambda
        entity_id="text",
        entity_text="text",
        entity_type="text",
        doc_id="text",
        sentence_index="int",
        begin_index="int",
        end_index="int",
    : [])
def extract(
        doc_id="text",
        sentence_index="int",
        tokens="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
    ):
    """
    Finds phrases made of consecutive tokens tagged with one of the mention
    types (org, company, location, person, time), and monetary amounts
    recognized with a regular expression.
    """
    # log(doc_id)
    TYPE_MENTION = frozenset(["org", "company", "location", "person", "time"])
    # When these two tag types appear back to back, merge them into one entity.
    TYPE_COMBINE = frozenset(["org", "company"])
    num_tokens = len(ner_tags)
    # Find the first index of every run of tokens tagged with a mention type,
    # keeping only tokens made of CJK characters, kana, letters or digits.
    first_indexes = (i for i in range(num_tokens)
                     if len(TYPE_MENTION.intersection([ner_tags[i]])) > 0
                     and (i == 0
                          or len(TYPE_MENTION.intersection([ner_tags[i-1]])) <= 0
                          or (len(TYPE_COMBINE.intersection(ner_tags[i-1:i+1])) < 2
                              and ner_tags[i-1] != ner_tags[i]))
                     and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9]+$', tokens[i]) is not None)
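    # Example (hypothetical tags): with ner_tags = ["o", "org", "company", "o"],
    # only index 1 is a first index; the org/company pair is merged by the loop
    # below into a single mention ending at index 2.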
    for begin_index in first_indexes:
        # Find the end of the mention phrase: consecutive tokens carrying the
        # same tag, or an org/company pair that TYPE_COMBINE merges into one span.
        end_index = begin_index + 1
        temp_end = end_index + 1
        while end_index < num_tokens and ((ner_tags[end_index] == ner_tags[end_index-1])
                or (len(TYPE_COMBINE.intersection(ner_tags[end_index-1:temp_end])) == 2)):
            end_index += 1
            temp_end = end_index + 1
            if temp_end == num_tokens:
                temp_end = -1
        end_index -= 1
        # Generate a mention identifier.
        entity_type = ner_tags[end_index]
        entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
        # entity_text = "".join(map(lambda i: tokens[i] if re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9]+$', tokens[i]) != None else '', range(begin_index, end_index + 1)))
        entity_text = "".join(tokens[begin_index:end_index+1])
        # Skip implausibly long mentions (25+ tokens).
        if end_index - begin_index >= 25:
            continue
        # Output a tuple for each mention phrase.
        yield [
            entity_id,
            entity_text,
            entity_type,
            doc_id,
            sentence_index,
            begin_index,
            end_index,
        ]
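        # A yielded row looks like, for a hypothetical document:
        #   ["doc1_0_1_2", "某某公司", "company", "doc1", 0, 1, 2]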
    '''
    # Recognize monetary amounts from POS tags (disabled).
    str_pos_tags = ""
    for i in range(len(pos_tags)):
        str_pos_tags += pos_tags[i]+str(i)
    entity_type = "money"
    money_pattern = re.compile("(?:\d+)(m\d+q\d+)")
    for item in re.findall(money_pattern, str_pos_tags):
        begin_index = int(item.split("q")[0][1:])
        end_index = int(item.split("q")[1])
        entity_text = str(getUnifyMoney("".join(tokens[begin_index:end_index+1])))
        if tokens[end_index] == "元" and len(entity_text) > 3:
            entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
            yield [
                entity_id,
                entity_text,
                entity_type,
                doc_id,
                sentence_index,
                begin_index,
                end_index
            ]
    '''
    # Recognize monetary amounts with a regular expression.
    entity_type = "money"
    # list_tokenbegin[j] is the character offset at which tokens[j] starts in
    # "".join(tokens); a final sentinel (total length + 1) is appended.
    list_tokenbegin = []
    begin = 0
    for i in range(0, len(tokens)):
        list_tokenbegin.append(begin)
        begin += len(str(tokens[i]))
    list_tokenbegin.append(begin+1)
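    # Example (hypothetical tokens): ["中标", "金额", "500", "万元"] gives
    # list_tokenbegin == [0, 2, 4, 7, 10].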
    money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?))*"
    #money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?|价.{,10}?|元.{,10}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?))*"
    money_patten = re.compile(money_patten_str)
    money_patten_all = re.compile("^" + money_patten_str + "$")
    all_match = re.findall(money_patten, "".join(tokens))
    index = 0
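    # The pattern has three alternatives; the group indexes used below are:
    #   group 1: the whole match;
    #   group 2: Arabic amount ending in 元/整, e.g. "500万元";
    #   group 3: Chinese-numeral amount of 3+ characters, e.g. "伍佰万元";
    #   groups 4+5: optional "万" unit and the amount following a currency
    #     marker such as "¥" or "…元:", e.g. "¥1,000".
    # (Example strings are illustrative, not taken from the source data.)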
    for i in range(len(all_match)):
        if len(all_match[i][0]) > 0:
            unit = ""
            if len(all_match[i][1]) > 0:
                entity_text = all_match[i][1]
            elif len(all_match[i][2]) > 0:
                entity_text = all_match[i][2]
            else:
                entity_text = all_match[i][4]
                unit = all_match[i][3]
            # index += len(all_match[i][0]) - len(entity_text)  # use the whole match as the entity
            # entity_text = getUnifyMoney(all_match[i])
            # Map the character offset of this match back to a begin token index.
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] == index:
                    begin_index = j
                    break
                elif list_tokenbegin[j] > index:
                    begin_index = j - 1
                    break
            # index += len(str(entity_text))  # use the whole match as the entity
            index += len(str(all_match[i][0]))
            # Map the offset just past the match back to an end token index.
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] >= index:
                    end_index = j - 1
                    break
            if re.search(money_patten_all, "".join(tokens[begin_index:end_index+1])) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                # getUnifyMoney / getMultipleFactor are imported from commonutil;
                # presumably they normalize the amount to a numeric value.
                if len(unit) > 0:
                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit))
                else:
                    entity_text = str(getUnifyMoney(entity_text))
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
        else:
            # Empty match (the pattern is fully optional); advance by one character.
            index += 1
    '''
    # Recognize dates with a regular expression (disabled).
    entity_type = "RegularTime"
    time_pattern_str = "([\d,]+[年/]\s*[,,-]?[\d,]+[月/]\s*[,,-]?(?:[\d,]+日?)?\s*[,,]?(?:\s*[,,]?(?:\d+[:时点])?(?:\d+[:分]?)?(?:\d+秒?)?)?)*"
    time_pattern = re.compile(time_pattern_str)
    time_pattern_all = re.compile("^" + time_pattern_str + "$")
    all_match = re.findall(time_pattern, "".join(tokens))
    index = 0
    for match_i in range(len(all_match)):
        if len(all_match[match_i]) > 0:
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] == index:
                    begin_index = j
                    break
                elif list_tokenbegin[j] > index:
                    begin_index = j - 1
                    break
            # index += len(str(entity_text))  # use the whole match as the entity
            index += len(str(all_match[match_i]))
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] >= index:
                    end_index = j - 1
                    break
            if re.search(time_pattern_all, "".join(tokens[begin_index:end_index+1])) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                entity_text = re.sub(re.compile("[\s,,]*"), "", all_match[match_i])
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
        else:
            index += 1
    '''
    '''
    # Recognize contact phone numbers (disabled).
    str_pos_tags = ""
    for i in range(len(pos_tags)):
        str_pos_tags += pos_tags[i]+str(i)
    PERSION_MENTION = frozenset(["nr", "nr1"])
    entity_type = "call"
    link_patten = re.compile("电话|联系|联系方式|手机")
    call_patten = re.compile("(m\d+(?:wp\d+m\d+){1,2})")
    match_patten = re.compile("^\d+(?:[--]+\d+){1,2}$")
    for item in re.findall(call_patten, str_pos_tags):
        begin_index = int(item.split("wp")[0][1:])
        end_index = int(item.split("wp")[-1].split("m")[1])
        entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
        entity_text = "".join(tokens[begin_index:end_index+1])
        if re.search(match_patten, entity_text) is not None:
            if begin_index > 5:
                word_infront = re.sub("\s+", "", "".join(tokens[begin_index-5:begin_index]))
                pos_infront = pos_tags[begin_index-5:begin_index]
            else:
                word_infront = re.sub("\s+", "", "".join(tokens[0:begin_index]))
                pos_infront = pos_tags[0:begin_index]
            if re.search(link_patten, word_infront) is not None:
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
            elif len(PERSION_MENTION.intersection(pos_infront)) > 0:
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
    call_patten = re.compile("m\d+")
    match_patten = re.compile("(^\d{7,8}$|^1\d{10}$)")
    for item in re.findall(call_patten, str_pos_tags):
        begin_index = int(item[1:])
        end_index = begin_index
        entity_text = tokens[begin_index]
        if re.match(match_patten, entity_text) is not None:
            if begin_index > 5:
                word_infront = re.sub("\s+", "", "".join(tokens[begin_index-5:begin_index]))
                pos_infront = pos_tags[begin_index-5:begin_index]
            else:
                word_infront = re.sub("\s+", "", "".join(tokens[0:begin_index]))
                pos_infront = pos_tags[0:begin_index]
            if re.search(link_patten, word_infront) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
            elif len(PERSION_MENTION.intersection(pos_infront)) > 0:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
    '''
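
# Usage sketch (assumption, not from this repository): a DeepDive app would
# typically wire this UDF up in app.ddlog roughly as below. The relation names
# "entity_mention" and "sentences" are hypothetical, chosen for illustration;
# only the column list matches extract() above.
#
#   function map_entity_mention over (
#           doc_id text, sentence_index int,
#           tokens text[], pos_tags text[], ner_tags text[])
#       returns rows like entity_mention
#       implementation "udf/map_entity_mention.py" handles tsv lines.
#
#   entity_mention += map_entity_mention(doc_id, sentence_index, tokens, pos_tags, ner_tags) :-
#       sentences(doc_id, sentence_index, tokens, pos_tags, ner_tags).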