#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""DeepDive NLP-markup UDF: split Chinese documents into sentences and
annotate each one with foolnltk tokens, POS tags, and NER labels."""

from deepdive import *

import re

import fool
from commonutil import *


@tsv_extractor
@returns(lambda
        doc_id         = "text",
        sentence_index = "int",
        sentence_text  = "text",
        tokens         = "text[]",
        lemmas         = "text[]",
        pos_tag        = "text[]",
        ner_tag        = "text[]",
        doc_offsets    = "int[]",
        dep_types      = "text[]",
        dep_tokens     = "int[]",
    : [])
def extract(
        doc_id  = "text",
        content = "text",
    ):
    if not content:
        return

    log("doc_id=====" + str(doc_id))

    # Split the document into sentences on the Chinese full stop and drop
    # the empty fragments left by trailing separators.
    split_pattern = "。"
    sentences = [s for s in re.split(split_pattern, content) if s]

    # foolnltk provides no lemmatizer, character offsets, or dependency
    # parser, so these columns are emitted empty for every sentence.
    lemmas = []
    doc_offsets = []
    dep_types = []
    dep_tokens = []

    # Tokenize all sentences in one batch, then reuse the token lists for
    # POS tagging and NER so each analyzer runs a single batched pass.
    tokens_all = fool.cut(sentences)
    pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
    ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences, tokens_all)

    for sentence_index, sentence_text in enumerate(sentences):
        yield [
            doc_id,
            sentence_index,
            sentence_text,
            tokens_all[sentence_index],
            lemmas,
            pos_all[sentence_index],
            ner_tag_all[sentence_index],
            doc_offsets,
            dep_types,
            dep_tokens,
        ]
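
# --- Wiring sketch (assumption): how a UDF like this is typically hooked up
# in DeepDive's DDlog, following the layout of the official spouse example.
# The relation names ("articles", "sentences") and the udf path here are
# hypothetical; substitute the ones declared in this app's app.ddlog.
#
#   function extract over (doc_id text, content text)
#       returns rows like sentences
#       implementation "udf/extract.py" handles tsv lines.
#
#   sentences += extract(doc_id, content) :- articles(doc_id, content).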