#!/usr/bin/env python
# encoding: utf-8
from deepdive import *    # DeepDive Python helpers (@tsv_extractor, @returns)
import re
import fool               # foolnltk: Chinese word segmentation, POS tagging and NER
from commonutil import *  # project-local utilities

@tsv_extractor
@returns(lambda
        doc_id         = "text",
        sentence_index = "int",
        sentence_text  = "text",
        tokens         = "text[]",
        lemmas         = "text[]",
        pos_tag        = "text[]",
        ner_tag        = "text[]",
        doc_offsets    = "int[]",
        dep_types      = "text[]",
        dep_tokens     = "int[]"
    : [])
def extract(
        doc_id  = "text",
        content = "text",
    ):
    if content is not None and len(content) != 0:
        log("doc_id=====" + str(doc_id))
        # Split the document into sentences on the Chinese full stop.
        split_pattern = "。"
        sentences = re.split(split_pattern, content)
        sentences = [x for x in sentences if len(x) != 0]

        # fool provides no lemmas, character offsets or dependency parses,
        # so those columns are emitted as empty arrays.
        lemmas = []
        doc_offsets = []
        dep_types = []
        dep_tokens = []
        # Run segmentation, POS tagging and NER over all sentences in one batch.
        tokens_all = fool.cut(sentences)
        pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
        ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences, tokens_all)

        # Emit one output row per sentence.
        for sentence_index in range(len(sentences)):
            sentence_text = sentences[sentence_index]
            tokens = tokens_all[sentence_index]
            pos_tag = pos_all[sentence_index]
            ner_tag = ner_tag_all[sentence_index]
            yield [
                doc_id,
                sentence_index,
                sentence_text,
                tokens,
                lemmas,
                pos_tag,
                ner_tag,
                doc_offsets,
                dep_types,
                dep_tokens,
            ]
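
When DeepDive runs this extractor, it feeds each input row (doc_id, content) as one tab-separated line on stdin and collects the yielded rows from stdout. Under that assumption it can be exercised outside the pipeline with a quick smoke test; the sketch below is illustrative only, and the script path, document id and sample text are made up, not part of the original app.

# Hypothetical local smoke test for the extractor above (a sketch, assuming the
# script is saved as udf/nlp_markup.py and that DeepDive's tsv_extractor helper
# drives the stdin/stdout TSV loop when the script is run directly).
import subprocess

sample_row = "doc_001\t小明在北京工作。他毕业于清华大学。\n"   # made-up sample input
result = subprocess.run(
    ["python", "udf/nlp_markup.py"],
    input=sample_row.encode("utf-8"),
    capture_output=True,
)
# Each output line should be one TSV row per sentence:
# doc_id, sentence_index, sentence_text, tokens, lemmas, pos_tag, ner_tag, ...
print(result.stdout.decode("utf-8"))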