luojiehua
/
BIDI_ML_INFO_EXTRACTION


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
							#!/usr/bin/env python
from deepdive import *
import ddlib

@tsv_extractor
@returns(lambda
        entity_id   = "text",
        feature = "text",
        window_size = "int",
    :[])
def extract(
        entity_id          = "text",
        entity_begin_index = "int",
        entity_end_index   = "int",
        doc_id         = "text",
        sent_index     = "int",
        tokens         = "text[]",
        pos_tags       = "text[]",
        ner_tags       = "text[]",
        window_size     = "int",
    ):
    """
    Yield the context tokens that surround an entity mention.

    For the mention spanning ``tokens[entity_begin_index]`` through
    ``tokens[entity_end_index]`` (both inclusive), emit one row per token
    found in the ``window_size`` tokens immediately before the mention and
    the ``window_size`` tokens immediately after it.  Tokens of the mention
    itself are never emitted.  Windows are truncated at the sentence
    boundaries.

    The string defaults in the signature are column type declarations, not
    usable default values (presumably read by the DeepDive decorators on
    this function -- confirm against the DeepDive UDF documentation).

    Parameters:
        entity_id: identifier echoed back in every emitted row.
        entity_begin_index: index in ``tokens`` of the mention's first token.
        entity_end_index: index in ``tokens`` of the mention's last token
            (inclusive; the earlier ddlib-based version of this extractor
            computed the mention length as ``end - begin + 1``).
        doc_id, sent_index, pos_tags, ner_tags: accepted as part of the
            input row but not used by this extractor.
        tokens: the sentence's tokens.
        window_size: number of context tokens to take on each side.

    Yields:
        ``[entity_id, token, window_size]`` for each context token, left
        context first, then right context, in sentence order.
    """
    # A negative window would turn the slice bounds below into
    # from-the-end indices; treat it as "no context" instead.
    span = max(window_size, 0)

    # Clamp at 0: a negative slice start would wrap around to the sentence end.
    left_start = max(entity_begin_index - span, 0)
    # The end index is inclusive, so the right context starts one past it.
    right_start = entity_end_index + 1

    left_context = tokens[left_start:entity_begin_index]
    # Slicing past the end of the sequence is safe; it simply truncates.
    right_context = tokens[right_start:right_start + span]

    for segment in (left_context, right_context):
        for token in segment:
            yield [entity_id, token, window_size]