1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
#!/usr/bin/env python
from deepdive import *
import ddlib
@tsv_extractor
@returns(lambda
        entity_id = "text",
        feature = "text",
        window_size = "int",
    :[])
def extract(
        entity_id = "text",
        entity_begin_index = "int",
        entity_end_index = "int",
        doc_id = "text",
        sent_index = "int",
        tokens = "text[]",
        pos_tags = "text[]",
        ner_tags = "text[]",
        window_size = "int",
    ):
    """
    Emit context-window token features for an entity mention.

    The entity occupies tokens[entity_begin_index : entity_end_index + 1]
    (both indices inclusive, matching the DeepDive span convention).  For
    each token within `window_size` positions to the LEFT of the entity and
    within `window_size` positions to the RIGHT of it, yield one output row
    [entity_id, token, window_size].  Entity tokens themselves are never
    emitted, and windows are clamped at the sentence boundaries.

    doc_id, sent_index, pos_tags and ner_tags are part of the extractor's
    declared input schema but are unused by this feature function.
    """
    sent_len = len(tokens)

    # Left context: clamp the window start at the beginning of the sentence.
    # The slice end excludes entity_begin_index, so no entity token leaks in.
    left_start = max(entity_begin_index - window_size, 0)

    # Right context: start just AFTER the entity's last token.  (The previous
    # version started AT entity_end_index, which emitted the entity's own
    # final token as a feature and dropped the furthest context token.)
    right_start = entity_end_index + 1
    right_end = min(right_start + window_size, sent_len)

    # Yield order preserved from the original: left window first, then right.
    for token in tokens[left_start:entity_begin_index]:
        yield [entity_id, token, window_size]
    for token in tokens[right_start:right_end]:
        yield [entity_id, token, window_size]
-
|