#!/usr/bin/env python
from deepdive import *
import ddlib


@tsv_extractor
@returns(lambda
        entity_id   = "text",
        feature     = "text",
        window_size = "int"
    : [])
def extract(
        entity_id          = "text",
        entity_begin_index = "int",
        entity_end_index   = "int",
        doc_id             = "text",
        sent_index         = "int",
        tokens             = "text[]",
        pos_tags           = "text[]",
        ner_tags           = "text[]",
        window_size        = "int",
    ):
    """
    Yield bag-of-words context features for an entity mention.

    Emits one row ``[entity_id, token, window_size]`` for each token that
    falls within ``window_size`` tokens to the left or to the right of the
    mention, clamped to the sentence boundaries.  The mention's own tokens
    are never emitted as features.

    Parameters (DeepDive column declarations):
      entity_id          -- unique id of the mention
      entity_begin_index -- token index of the mention's first token
      entity_end_index   -- token index of the mention's last token
                            (inclusive, matching the DDLIB Span convention
                            length = end - begin + 1)
      doc_id, sent_index -- sentence coordinates (unused here, kept for the
                            extractor's declared input schema)
      tokens             -- sentence tokens
      pos_tags, ner_tags -- per-token tags (unused by this bag-of-words
                            variant; a richer DDLIB-based variant would
                            consume them via get_generic_features_mention)
      window_size        -- number of context tokens to take on each side
    """
    maxlen = len(tokens)

    # Left context: up to window_size tokens ending just before the mention.
    # max() clamps at the sentence start; the slice excludes the mention's
    # first token because the upper bound is exclusive.
    left_start = max(entity_begin_index - window_size, 0)

    # Right context: up to window_size tokens starting just after the mention.
    # entity_end_index is inclusive, so the window begins at
    # entity_end_index + 1 (starting at entity_end_index would wrongly
    # re-emit the mention's last token as context); min() clamps at the
    # sentence end.
    right_start = entity_end_index + 1
    right_end = min(right_start + window_size, maxlen)

    for token in tokens[left_start:entity_begin_index]:
        yield [entity_id, token, window_size]
    for token in tokens[right_start:right_end]:
        yield [entity_id, token, window_size]