123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- #!/usr/bin/env python
- from deepdive import *
- import ddlib
@tsv_extractor
@returns(lambda
    p1_id="text",
    p2_id="text",
    feature="text",
    window_size="int",
    : [])
def extract(
    p1_id="text",
    p2_id="text",
    p1_begin_index="int",
    p1_end_index="int",
    p2_begin_index="int",
    p2_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    lemmas="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
    window_size="int",
):
    """
    Generate DDLIB generic relation features for a pair of person mentions.

    The string defaults in the signature (and in the ``@returns`` lambda)
    are the column type declarations for the input and output rows.

    Yields one ``[p1_id, p2_id, feature, window_size]`` row for every
    feature produced by ``ddlib.get_generic_features_relation`` when called
    on the sentence and the two mention spans with ``window=window_size``.
    """
    # A DDLIB sentence is simply a list of DDLIB Word objects, one per token.
    # Every word gets dep_par=-1 (DDLIB's ROOT marker; CoreNLP stores ROOT
    # as 0) and an empty dependency label.
    # NOTE(review): dep_types and dep_parents are accepted but never used
    # here -- presumably dependency features are switched off on purpose;
    # confirm before relying on them.
    sentence = [
        ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=token,
            lemma=lemmas[idx],
            pos=pos_tags[idx],
            ner=ner_tags[idx],
            dep_par=-1,
            dep_label='')
        for idx, token in enumerate(tokens)
    ]

    # Mention spans: the end index is counted as part of the mention, hence
    # the +1 when turning a (begin, end) pair into a length.
    first_length = p1_end_index - p1_begin_index + 1
    second_length = p2_end_index - p2_begin_index + 1
    first_span = ddlib.Span(begin_word_id=p1_begin_index, length=first_length)
    second_span = ddlib.Span(begin_word_id=p2_begin_index, length=second_length)

    # Emit one output row per generic feature computed by DDLIB.
    generic_features = ddlib.get_generic_features_relation(
        sentence, first_span, second_span, window=window_size)
    for feat in generic_features:
        yield [p1_id, p2_id, feat, window_size]
|