#!/usr/bin/env python
# encoding: utf-8
from deepdive import *
import re
from commonutil import *

@tsv_extractor
@returns(lambda
        entity_id      = "text",
        entity_text    = "text",
        entity_type    = "text",
        doc_id         = "text",
        sentence_index = "int",
        begin_index    = "int",
        end_index      = "int",
    :[])
def extract(
        doc_id         = "text",
        sentence_index = "int",
        tokens         = "text[]",
        pos_tags       = "text[]",
        ner_tags       = "text[]",
    ):
    """
    Finds mention phrases: maximal runs of tokens whose NER tag is in
    TYPE_MENTION (adjacent "org"/"company" runs are merged into one mention),
    plus money amounts recognized by a regular expression.
    """
    #log(doc_id)
    TYPE_MENTION = frozenset(["org", "company", "location", "person", "time"])
    # when these two types appear back to back, merge them into one entity
    TYPE_COMBINE = frozenset(["org", "company"])
    num_tokens = len(ner_tags)

    # find the first index of every maximal run of mention-tagged tokens;
    # the first token must be CJK, kana, or alphanumeric
    first_indexes = (i for i in range(num_tokens)
                     if ner_tags[i] in TYPE_MENTION
                     and (i == 0
                          or ner_tags[i-1] not in TYPE_MENTION
                          or (len(TYPE_COMBINE.intersection(ner_tags[i-1:i+1])) < 2
                              and ner_tags[i-1] != ner_tags[i]))
                     and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9]+$', tokens[i]) is not None)
    for begin_index in first_indexes:
        # extend the phrase while the tag repeats, or while the current pair
        # of tags is exactly {"org", "company"}; Python slices clamp safely at
        # the end of the sentence, so no extra bounds bookkeeping is needed
        end_index = begin_index + 1
        while end_index < num_tokens and (
                ner_tags[end_index] == ner_tags[end_index-1]
                or len(TYPE_COMBINE.intersection(ner_tags[end_index-1:end_index+1])) == 2):
            end_index += 1
        end_index -= 1
        # generate a mention identifier
        entity_type = ner_tags[end_index]
        entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
        #entity_text = "".join(map(lambda i: tokens[i] if re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9]+$', tokens[i]) != None else '', range(begin_index, end_index + 1)))
        entity_text = "".join(tokens[begin_index:end_index+1])
        # drop implausibly long spans (almost certainly tagger noise)
        if end_index - begin_index >= 25:
            continue
        # Output a tuple for each mention phrase
        yield [
            entity_id,
            entity_text,
            entity_type,
            doc_id,
            sentence_index,
            begin_index,
            end_index,
        ]
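
    # Illustrative walk-through of the loop above (made-up tags; the tokens
    # are assumed to pass the character filter): with
    #   ner_tags = ["company", "org", "o", "person"]
    # tokens 0-1 become one "org"-typed mention, because the pair of tags is
    # exactly TYPE_COMBINE, and token 3 becomes a separate "person" mention.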

    '''
    # Recognize money amounts from part-of-speech tags
    str_pos_tags = ""
    for i in range(len(pos_tags)):
        str_pos_tags += pos_tags[i] + str(i)
    entity_type = "money"
    money_pattern = re.compile("(?:\d+)(m\d+q\d+)")
    for item in re.findall(money_pattern, str_pos_tags):
        begin_index = int(item.split("q")[0][1:])
        end_index = int(item.split("q")[1])
        entity_text = str(getUnifyMoney("".join(tokens[begin_index:end_index+1])))
        if tokens[end_index] == "元" and len(entity_text) > 3:
            entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
            yield [
                entity_id,
                entity_text,
                entity_type,
                doc_id,
                sentence_index,
                begin_index,
                end_index
            ]
    '''

    # Recognize money amounts with a regular expression
    entity_type = "money"
    # character offset of each token inside the joined sentence string,
    # plus a sentinel past the last character
    list_tokenbegin = []
    begin = 0
    for i in range(0, len(tokens)):
        list_tokenbegin.append(begin)
        begin += len(str(tokens[i]))
    list_tokenbegin.append(begin + 1)
    # group 2: arabic number ending in 元/整; group 3: Chinese-numeral amount;
    # groups 4+5: optional 万 unit and the number after a 元/¥ currency marker
    money_patten_str = r"(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?))*"
    #money_patten_str = r"(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?|价.{,10}?|元.{,10}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?))*"
    money_patten = re.compile(money_patten_str)
    money_patten_all = re.compile("^" + money_patten_str + "$")
    # the trailing * lets the pattern match (possibly empty) at every scan
    # position, so the findall results tile the sentence: an empty match
    # advances one character, a real match advances by its own length
    all_match = re.findall(money_patten, "".join(tokens))
    index = 0
    for i in range(len(all_match)):
        if len(all_match[i][0]) > 0:
            unit = ""
            if len(all_match[i][1]) > 0:
                entity_text = all_match[i][1]
            elif len(all_match[i][2]) > 0:
                entity_text = all_match[i][2]
            else:
                entity_text = all_match[i][4]
                unit = all_match[i][3]
            #index += len(all_match[i][0]) - len(entity_text)  # treat the whole match as the entity
            #entity_text = getUnifyMoney(all_match[i])
            # snap the character offset to token boundaries
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] == index:
                    begin_index = j
                    break
                elif list_tokenbegin[j] > index:
                    begin_index = j - 1
                    break
            #index += len(str(entity_text))  # treat the whole match as the entity
            index += len(str(all_match[i][0]))
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] >= index:
                    end_index = j - 1
                    break
            # keep the mention only if the snapped token span is still
            # entirely a money expression
            if re.search(money_patten_all, "".join(tokens[begin_index:end_index+1])) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                if len(unit) > 0:
                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit))
                else:
                    entity_text = str(getUnifyMoney(entity_text))
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
        else:
            index += 1
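
    # Worked example for the pattern above (hand-checked; the sentence and the
    # commonutil behavior are assumptions): if the tokens join to
    # "中标(万元):2,000", the currency-marker branch matches "(万元):2,000",
    # giving entity_text = "2,000" and unit = "万"; assuming getUnifyMoney
    # strips the separators (-> 2000), getMultipleFactor("万") == 10000, and
    # the span snaps cleanly to token boundaries, the yielded entity_text
    # is "20000000".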

    '''
    # Recognize dates with a regular expression
    entity_type = "RegularTime"
    time_pattern_str = "([\d,]+[年/]\s*[,,-]?[\d,]+[月/]\s*[,,-]?(?:[\d,]+日?)?\s*[,,]?(?:\s*[,,]?(?:\d+[:时点])?(?:\d+[:分]?)?(?:\d+秒?)?)?)*"
    time_pattern = re.compile(time_pattern_str)
    time_pattern_all = re.compile("^" + time_pattern_str + "$")
    all_match = re.findall(time_pattern, "".join(tokens))
    index = 0
    for match_i in range(len(all_match)):
        if len(all_match[match_i]) > 0:
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] == index:
                    begin_index = j
                    break
                elif list_tokenbegin[j] > index:
                    begin_index = j - 1
                    break
            #index += len(str(entity_text))  # treat the whole match as the entity
            index += len(str(all_match[match_i]))
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] >= index:
                    end_index = j - 1
                    break
            if re.search(time_pattern_all, "".join(tokens[begin_index:end_index+1])) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                entity_text = re.sub(re.compile("[\s,,]*"), "", all_match[match_i])
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
        else:
            index += 1
    '''

    '''
    # Recognize contact phone numbers
    # encode the POS tag sequence as one string, e.g. "n0v1m2wp3m4", so that
    # tag patterns can be matched with a regular expression
    str_pos_tags = ""
    for i in range(len(pos_tags)):
        str_pos_tags += pos_tags[i] + str(i)
    PERSON_MENTION = frozenset(["nr", "nr1"])
    entity_type = "call"
    link_patten = re.compile("电话|联系|联系方式|手机")
    # hyphenated numbers: number, separator, number (optionally repeated)
    call_patten = re.compile("(m\d+(?:wp\d+m\d+){1,2})")
    match_patten = re.compile("^\d+(?:[--]+\d+){1,2}$")
    for item in re.findall(call_patten, str_pos_tags):
        begin_index = int(item.split("wp")[0][1:])
        end_index = int(item.split("wp")[-1].split("m")[1])
        entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
        entity_text = "".join(tokens[begin_index:end_index+1])
        if re.search(match_patten, entity_text) is not None:
            # keep the candidate only if a contact keyword or a person name
            # appears among the five tokens in front of it
            if begin_index > 5:
                word_infront = re.sub("\s+", "", "".join(tokens[begin_index-5:begin_index]))
                pos_infront = pos_tags[begin_index-5:begin_index]
            else:
                word_infront = re.sub("\s+", "", "".join(tokens[0:begin_index]))
                pos_infront = pos_tags[0:begin_index]
            if re.search(link_patten, word_infront) is not None:
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
            elif len(PERSON_MENTION.intersection(pos_infront)) > 0:
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
    # single-token numbers: 7-8 digit landlines or 11-digit mobile numbers
    call_patten = re.compile("m\d+")
    match_patten = re.compile("(^\d{7,8}$|^1\d{10}$)")
    for item in re.findall(call_patten, str_pos_tags):
        begin_index = int(item[1:])
        end_index = begin_index
        entity_text = tokens[begin_index]
        if re.match(match_patten, entity_text) is not None:
            if begin_index > 5:
                word_infront = re.sub("\s+", "", "".join(tokens[begin_index-5:begin_index]))
                pos_infront = pos_tags[begin_index-5:begin_index]
            else:
                word_infront = re.sub("\s+", "", "".join(tokens[0:begin_index]))
                pos_infront = pos_tags[0:begin_index]
            if re.search(link_patten, word_infront) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
            elif len(PERSON_MENTION.intersection(pos_infront)) > 0:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
    '''
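
# Local smoke-test sketch (assumption: the file is run by hand; when DeepDive
# drives it, @tsv_extractor feeds TSV rows on stdin instead, so this stays
# commented out; the sample row below is invented):
#
#if __name__ == "__main__":
#    for row in extract(doc_id="doc_0", sentence_index=0,
#                       tokens=[u"项目", u"中标", u"金额", u"100万元"],
#                       pos_tags=["n", "v", "n", "mq"],
#                       ner_tags=["o", "o", "o", "o"]):
#        print(row)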