map_entity_mention.py

#!/usr/bin/env python
#encoding:utf-8
from deepdive import *
import re
from commonutil import *


@tsv_extractor
@returns(lambda
        entity_id="text",
        entity_text="text",
        entity_type="text",
        doc_id="text",
        sentence_index="int",
        begin_index="int",
        end_index="int",
    : [])
def extract(
        doc_id="text",
        sentence_index="int",
        tokens="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
    ):
    """
    Finds phrases made of consecutive tokens tagged with one of the mention
    types (org, company, location, person, time), and monetary amounts
    recognized with a regular expression.
    """
    # log(doc_id)
    TYPE_MENTION = frozenset(["org", "company", "location", "person", "time"])
    # When these two tag types appear back to back, merge them into one entity.
    TYPE_COMBINE = frozenset(["org", "company"])
    num_tokens = len(ner_tags)
    # Find the first index of every run of tokens tagged with a mention type,
    # keeping only tokens made of CJK characters, kana, letters or digits.
    first_indexes = (i for i in range(num_tokens)
                     if len(TYPE_MENTION.intersection([ner_tags[i]])) > 0
                     and (i == 0
                          or len(TYPE_MENTION.intersection([ner_tags[i-1]])) <= 0
                          or (len(TYPE_COMBINE.intersection(ner_tags[i-1:i+1])) < 2
                              and ner_tags[i-1] != ner_tags[i]))
                     and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9]+$', tokens[i]) is not None)
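    # Example (hypothetical tags): with ner_tags = ["o", "org", "company", "o"],
    # only index 1 is a first index; the org/company pair is merged by the loop
    # below into a single mention ending at index 2.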
    for begin_index in first_indexes:
        # Find the end of the mention phrase: consecutive tokens carrying the
        # same tag, or an org/company pair that TYPE_COMBINE merges into one span.
        end_index = begin_index + 1
        temp_end = end_index + 1
        while end_index < num_tokens and ((ner_tags[end_index] == ner_tags[end_index-1])
                or (len(TYPE_COMBINE.intersection(ner_tags[end_index-1:temp_end])) == 2)):
            end_index += 1
            temp_end = end_index + 1
            if temp_end == num_tokens:
                temp_end = -1
        end_index -= 1
        # Generate a mention identifier.
        entity_type = ner_tags[end_index]
        entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
        # entity_text = "".join(map(lambda i: tokens[i] if re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9]+$', tokens[i]) != None else '', range(begin_index, end_index + 1)))
        entity_text = "".join(tokens[begin_index:end_index+1])
        # Skip implausibly long mentions (25+ tokens).
        if end_index - begin_index >= 25:
            continue
        # Output a tuple for each mention phrase.
        yield [
            entity_id,
            entity_text,
            entity_type,
            doc_id,
            sentence_index,
            begin_index,
            end_index,
        ]
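        # A yielded row looks like, for a hypothetical document:
        #   ["doc1_0_1_2", "某某公司", "company", "doc1", 0, 1, 2]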
    '''
    # Recognize monetary amounts from POS tags (disabled).
    str_pos_tags = ""
    for i in range(len(pos_tags)):
        str_pos_tags += pos_tags[i]+str(i)
    entity_type = "money"
    money_pattern = re.compile("(?:\d+)(m\d+q\d+)")
    for item in re.findall(money_pattern, str_pos_tags):
        begin_index = int(item.split("q")[0][1:])
        end_index = int(item.split("q")[1])
        entity_text = str(getUnifyMoney("".join(tokens[begin_index:end_index+1])))
        if tokens[end_index] == "元" and len(entity_text) > 3:
            entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
            yield [
                entity_id,
                entity_text,
                entity_type,
                doc_id,
                sentence_index,
                begin_index,
                end_index
            ]
    '''
    # Recognize monetary amounts with a regular expression.
    entity_type = "money"
    # list_tokenbegin[j] is the character offset at which tokens[j] starts in
    # "".join(tokens); a final sentinel (total length + 1) is appended.
    list_tokenbegin = []
    begin = 0
    for i in range(0, len(tokens)):
        list_tokenbegin.append(begin)
        begin += len(str(tokens[i]))
    list_tokenbegin.append(begin+1)
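    # Example (hypothetical tokens): ["中标", "金额", "500", "万元"] gives
    # list_tokenbegin == [0, 2, 4, 7, 10].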
    money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?))*"
    #money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?|价.{,10}?|元.{,10}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?))*"
    money_patten = re.compile(money_patten_str)
    money_patten_all = re.compile("^" + money_patten_str + "$")
    all_match = re.findall(money_patten, "".join(tokens))
    index = 0
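    # The pattern has three alternatives; the group indexes used below are:
    #   group 1: the whole match;
    #   group 2: Arabic amount ending in 元/整, e.g. "500万元";
    #   group 3: Chinese-numeral amount of 3+ characters, e.g. "伍佰万元";
    #   groups 4+5: optional "万" unit and the amount following a currency
    #     marker such as "¥" or "…元:", e.g. "¥1,000".
    # (Example strings are illustrative, not taken from the source data.)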
    for i in range(len(all_match)):
        if len(all_match[i][0]) > 0:
            unit = ""
            if len(all_match[i][1]) > 0:
                entity_text = all_match[i][1]
            elif len(all_match[i][2]) > 0:
                entity_text = all_match[i][2]
            else:
                entity_text = all_match[i][4]
                unit = all_match[i][3]
            # index += len(all_match[i][0]) - len(entity_text)  # use the whole match as the entity
            # entity_text = getUnifyMoney(all_match[i])
            # Map the character offset of this match back to a begin token index.
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] == index:
                    begin_index = j
                    break
                elif list_tokenbegin[j] > index:
                    begin_index = j - 1
                    break
            # index += len(str(entity_text))  # use the whole match as the entity
            index += len(str(all_match[i][0]))
            # Map the offset just past the match back to an end token index.
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] >= index:
                    end_index = j - 1
                    break
            if re.search(money_patten_all, "".join(tokens[begin_index:end_index+1])) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                # getUnifyMoney / getMultipleFactor are imported from commonutil;
                # presumably they normalize the amount to a numeric value.
                if len(unit) > 0:
                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit))
                else:
                    entity_text = str(getUnifyMoney(entity_text))
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
        else:
            # Empty match (the pattern is fully optional); advance by one character.
            index += 1
    '''
    # Recognize dates with a regular expression (disabled).
    entity_type = "RegularTime"
    time_pattern_str = "([\d,]+[年/]\s*[,,-]?[\d,]+[月/]\s*[,,-]?(?:[\d,]+日?)?\s*[,,]?(?:\s*[,,]?(?:\d+[:时点])?(?:\d+[:分]?)?(?:\d+秒?)?)?)*"
    time_pattern = re.compile(time_pattern_str)
    time_pattern_all = re.compile("^" + time_pattern_str + "$")
    all_match = re.findall(time_pattern, "".join(tokens))
    index = 0
    for match_i in range(len(all_match)):
        if len(all_match[match_i]) > 0:
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] == index:
                    begin_index = j
                    break
                elif list_tokenbegin[j] > index:
                    begin_index = j - 1
                    break
            # index += len(str(entity_text))  # use the whole match as the entity
            index += len(str(all_match[match_i]))
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] >= index:
                    end_index = j - 1
                    break
            if re.search(time_pattern_all, "".join(tokens[begin_index:end_index+1])) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                entity_text = re.sub(re.compile("[\s,,]*"), "", all_match[match_i])
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
        else:
            index += 1
    '''
    '''
    # Recognize contact phone numbers (disabled).
    str_pos_tags = ""
    for i in range(len(pos_tags)):
        str_pos_tags += pos_tags[i]+str(i)
    PERSION_MENTION = frozenset(["nr", "nr1"])
    entity_type = "call"
    link_patten = re.compile("电话|联系|联系方式|手机")
    call_patten = re.compile("(m\d+(?:wp\d+m\d+){1,2})")
    match_patten = re.compile("^\d+(?:[--]+\d+){1,2}$")
    for item in re.findall(call_patten, str_pos_tags):
        begin_index = int(item.split("wp")[0][1:])
        end_index = int(item.split("wp")[-1].split("m")[1])
        entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
        entity_text = "".join(tokens[begin_index:end_index+1])
        if re.search(match_patten, entity_text) is not None:
            if begin_index > 5:
                word_infront = re.sub("\s+", "", "".join(tokens[begin_index-5:begin_index]))
                pos_infront = pos_tags[begin_index-5:begin_index]
            else:
                word_infront = re.sub("\s+", "", "".join(tokens[0:begin_index]))
                pos_infront = pos_tags[0:begin_index]
            if re.search(link_patten, word_infront) is not None:
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
            elif len(PERSION_MENTION.intersection(pos_infront)) > 0:
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
    call_patten = re.compile("m\d+")
    match_patten = re.compile("(^\d{7,8}$|^1\d{10}$)")
    for item in re.findall(call_patten, str_pos_tags):
        begin_index = int(item[1:])
        end_index = begin_index
        entity_text = tokens[begin_index]
        if re.match(match_patten, entity_text) is not None:
            if begin_index > 5:
                word_infront = re.sub("\s+", "", "".join(tokens[begin_index-5:begin_index]))
                pos_infront = pos_tags[begin_index-5:begin_index]
            else:
                word_infront = re.sub("\s+", "", "".join(tokens[0:begin_index]))
                pos_infront = pos_tags[0:begin_index]
            if re.search(link_patten, word_infront) is not None:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
            elif len(PERSION_MENTION.intersection(pos_infront)) > 0:
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                yield [
                    entity_id,
                    entity_text,
                    entity_type,
                    doc_id,
                    sentence_index,
                    begin_index,
                    end_index
                ]
    '''
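
# Usage sketch (assumption, not from this repository): a DeepDive app would
# typically wire this UDF up in app.ddlog roughly as below. The relation names
# "entity_mention" and "sentences" are hypothetical, chosen for illustration;
# only the column list matches extract() above.
#
#   function map_entity_mention over (
#           doc_id text, sentence_index int,
#           tokens text[], pos_tags text[], ner_tags text[])
#       returns rows like entity_mention
#       implementation "udf/map_entity_mention.py" handles tsv lines.
#
#   entity_mention += map_entity_mention(doc_id, sentence_index, tokens, pos_tags, ner_tags) :-
#       sentences(doc_id, sentence_index, tokens, pos_tags, ner_tags).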