#!/usr/bin/env python
# encoding: utf-8
from deepdive import *
import re
from commonutil import *
@tsv_extractor
# DeepDive python udf idiom: the lambda's keyword defaults declare the output
# columns and their types; the defaults of extract() declare the input columns
@returns(lambda
        entity_id      = "text",
        entity_text    = "text",
        entity_type    = "text",
        doc_id         = "text",
        sentence_index = "int",
        begin_index    = "int",
        end_index      = "int",
    : [])
def extract(
        doc_id         = "text",
        sentence_index = "int",
        tokens         = "text[]",
        pos_tags       = "text[]",
        ner_tags       = "text[]",
    ):
- """
- Finds phrases that are continuous words tagged with company.
- """
- #log(doc_id)
- TYPE_MENTION = frozenset(["org","company","location","person","time"])
- #以下两种接连出现时合为一个实体
- TYPE_COMBINE = frozenset(["org","company"])
-
- num_tokens = len(ner_tags)
- # find all first indexes of series of tokens tagged as company
- first_indexes = (i for i in range(num_tokens) if len(TYPE_MENTION.intersection([ner_tags[i]]))>0 and (i == 0 or len(TYPE_MENTION.intersection([ner_tags[i-1]]))<=0 or (len(TYPE_COMBINE.intersection(ner_tags[i-1:i+1]))<2 and ner_tags[i-1]!=ner_tags[i])) and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9]+$', tokens[i]) != None)
    for begin_index in first_indexes:
        # extend the phrase while the tag repeats or the adjacent pair of tags
        # forms an org/company combination (slicing past the end is harmless)
        end_index = begin_index + 1
        while end_index < num_tokens and (
                ner_tags[end_index] == ner_tags[end_index - 1]
                or len(TYPE_COMBINE.intersection(ner_tags[end_index - 1:end_index + 1])) == 2):
            end_index += 1
        end_index -= 1
        # skip implausibly long spans
        if end_index - begin_index >= 25:
            continue
        # generate a mention identifier and the mention text
        entity_type = ner_tags[end_index]
        entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
        # alternative: keep only tokens matching token_pattern instead of the raw span
        entity_text = "".join(tokens[begin_index:end_index + 1])
        # output one tuple per entity phrase
        yield [
            entity_id,
            entity_text,
            entity_type,
            doc_id,
            sentence_index,
            begin_index,
            end_index,
        ]
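        # Worked example (illustrative): with
        #   ner_tags = ["o", "company", "company", "org", "o"]
        # index 1 starts a phrase, indexes 2-3 are absorbed (repeated tag,
        # then an org/company pair), so the span is begin_index=1,
        # end_index=3 and entity_type is "org", the tag of the last token.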

    '''
    # recognize money amounts via POS tags
    # (m = numeral, q = measure word; str_pos_tags appends each token's index
    # to its tag, so regex matches over tags can recover token positions)
    str_pos_tags = ""
    for i in range(len(pos_tags)):
        str_pos_tags += pos_tags[i] + str(i)
    entity_type = "money"
    money_pattern = re.compile("(?:\d+)(m\d+q\d+)")
    for item in re.findall(money_pattern, str_pos_tags):
        begin_index = int(item.split("q")[0][1:])
        end_index = int(item.split("q")[1])
        entity_text = str(getUnifyMoney("".join(tokens[begin_index:end_index + 1])))
        if tokens[end_index] == "元" and len(entity_text) > 3:
            entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
            yield [
                entity_id,
                entity_text,
                entity_type,
                doc_id,
                sentence_index,
                begin_index,
                end_index
            ]
    '''
    # recognize money amounts via regular expressions
    entity_type = "money"
    # character offset of each token within the joined sentence string, used
    # to map regex match positions back to token indexes
    list_tokenbegin = []
    begin = 0
    for i in range(len(tokens)):
        list_tokenbegin.append(begin)
        begin += len(tokens[i])
    list_tokenbegin.append(begin + 1)
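    # e.g. tokens = [u"金额", u"：", u"1,000", u"万元"] gives
    # list_tokenbegin = [0, 2, 3, 8, 11] (a final sentinel of total length + 1)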
    # group layout: [0] whole match, [1] Arabic amount ending in 元/整,
    # [2] Chinese-numeral amount (3+ chars), [3] optional 万 unit of a
    # "(万元):"/"¥"-style prefix, [4] the number following such a prefix
    money_pattern_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?))*"
    # shelved looser variant that also matched "价…"/"元…" prefixes:
    #money_pattern_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?|价.{,10}?|元.{,10}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?))*"
    money_pattern = re.compile(money_pattern_str)
    money_pattern_all = re.compile("^" + money_pattern_str + "$")
    all_match = re.findall(money_pattern, "".join(tokens))
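    # findall yields one 5-tuple per repetition of the outer (...)* group; the
    # pattern also matches the empty string, so empty tuples appear at every
    # non-matching position and the cursor below then advances one character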
    index = 0
    for i in range(len(all_match)):
        if len(all_match[i][0]) > 0:
            unit = ""
            if len(all_match[i][1]) > 0:
                entity_text = all_match[i][1]
            elif len(all_match[i][2]) > 0:
                entity_text = all_match[i][2]
            else:
                entity_text = all_match[i][4]
                unit = all_match[i][3]
            #index += len(all_match[i][0]) - len(entity_text)  # use the whole extracted span as the entity
            #entity_text = getUnifyMoney(all_match[i])
            # map the match's character offset to a begin token index
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] == index:
                    begin_index = j
                    break
                elif list_tokenbegin[j] > index:
                    begin_index = j - 1
                    break
            index += len(all_match[i][0])
            # and the offset just past the match to an end token index
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] >= index:
                    end_index = j - 1
                    break
            # keep the span only if its full token text is itself a money match
            if re.search(money_pattern_all, "".join(tokens[begin_index:end_index + 1])) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                if len(unit) > 0:
                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit))
                else:
                    entity_text = str(getUnifyMoney(entity_text))
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
        else:
            # empty repetition: the regex cursor advanced by one character
            index += 1
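
    # Illustrative matches (assumption: commonutil's getUnifyMoney normalizes
    # an amount string to a number and getMultipleFactor("万") returns 10000):
    #   "1,000万元"    -> all_match[i][1] == "1,000万元"
    #   "(万元):2,500" -> all_match[i][3] == "万", all_match[i][4] == "2,500",
    #                     i.e. getUnifyMoney("2,500") * getMultipleFactor("万")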

    '''
    # recognize dates via regular expressions
    entity_type = "RegularTime"
    time_pattern_str = "([\d,]+[年/]\s*[,,-]?[\d,]+[月/]\s*[,,-]?(?:[\d,]+日?)?\s*[,,]?(?:\s*[,,]?(?:\d+[:时点])?(?:\d+[:分]?)?(?:\d+秒?)?)?)*"
    time_pattern = re.compile(time_pattern_str)
    time_pattern_all = re.compile("^" + time_pattern_str + "$")
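    # e.g. u"2015年6月30日,14时30分" is a single match of time_pattern_str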
    all_match = re.findall(time_pattern, "".join(tokens))
    index = 0
    for match_i in range(len(all_match)):
        if len(all_match[match_i]) > 0:
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] == index:
                    begin_index = j
                    break
                elif list_tokenbegin[j] > index:
                    begin_index = j - 1
                    break
            #index += len(entity_text)  # use the whole extracted span as the entity
            index += len(all_match[match_i])
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] >= index:
                    end_index = j - 1
                    break
            if re.search(time_pattern_all, "".join(tokens[begin_index:end_index + 1])) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                # strip whitespace and commas from the matched date text
                entity_text = re.sub(re.compile("[\s,,]*"), "", all_match[match_i])
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
        else:
            index += 1
    '''
    '''
    # recognize contact phone numbers
    # str_pos_tags appends each token's index to its POS tag, as above
    str_pos_tags = ""
    for i in range(len(pos_tags)):
        str_pos_tags += pos_tags[i] + str(i)
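    # e.g. pos_tags ["n", "wp", "m"] yield str_pos_tags == "n0wp1m2", so a
    # regex over the tag stream (call_patten below) can recover token indexes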

    PERSON_MENTION = frozenset(["nr", "nr1"])  # POS tags for person names
    entity_type = "call"
    link_patten = re.compile("电话|联系|联系方式|手机")
    # multi-token numbers: numeral, punctuation, numeral (area code-number style)
    call_patten = re.compile("(m\d+(?:wp\d+m\d+){1,2})")
    match_patten = re.compile("^\d+(?:[--]+\d+){1,2}$")
    for item in re.findall(call_patten, str_pos_tags):
        begin_index = int(item.split("wp")[0][1:])
        end_index = int(item.split("wp")[-1].split("m")[1])
        entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
        entity_text = "".join(tokens[begin_index:end_index + 1])
        if re.search(match_patten, entity_text) is not None:
            # check up to five preceding tokens for a contact keyword or a person name
            if begin_index > 5:
                word_infront = re.sub("\s+", "", "".join(tokens[begin_index - 5:begin_index]))
                pos_infront = pos_tags[begin_index - 5:begin_index]
            else:
                word_infront = re.sub("\s+", "", "".join(tokens[0:begin_index]))
                pos_infront = pos_tags[0:begin_index]
            if (re.search(link_patten, word_infront) is not None
                    or len(PERSON_MENTION.intersection(pos_infront)) > 0):
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]

    # single-token numbers: 7-8 digit landlines or 11-digit mobile numbers
    call_patten = re.compile("m\d+")
    match_patten = re.compile("(^\d{7,8}$|^1\d{10}$)")
    for item in re.findall(call_patten, str_pos_tags):
        begin_index = int(item[1:])
        end_index = begin_index
        entity_text = tokens[begin_index]
        if re.match(match_patten, entity_text) is not None:
            if begin_index > 5:
                word_infront = re.sub("\s+", "", "".join(tokens[begin_index - 5:begin_index]))
                pos_infront = pos_tags[begin_index - 5:begin_index]
            else:
                word_infront = re.sub("\s+", "", "".join(tokens[0:begin_index]))
                pos_infront = pos_tags[0:begin_index]
            if (re.search(link_patten, word_infront) is not None
                    or len(PERSON_MENTION.intersection(pos_infront)) > 0):
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
    '''
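
# Usage sketch (assumption: this udf follows the stock DeepDive python
# protocol, where @tsv_extractor reads TSV rows from stdin and writes TSV
# rows to stdout when the script is run; array columns arrive as PostgreSQL
# array literals). File and data names here are illustrative only:
#
#   python extract_entities.py < sentences.tsv > entity_mentions.tsv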