# nlp_markup_with_foolnltk.py
#!/usr/bin/env python
#encoding:utf-8
import re

import fool

from deepdive import *
#from transform import *
from commonutil import *
  8. @tsv_extractor
  9. @returns(lambda
  10. doc_id = "text",
  11. sentence_index = "int",
  12. sentence_text = "text",
  13. tokens = "text[]",
  14. lemmas = "text[]",
  15. pos_tag = "text[]",
  16. ner_tag = "text[]",
  17. doc_offsets = "int[]",
  18. dep_types = "text[]",
  19. dep_tokens = "int[]"
  20. :[])
  21. def extract(
  22. doc_id = "text",
  23. content ="text",
  24. ):
  25. if content is not None and len(content)!=0:
  26. log("doc_id====="+str(doc_id))
  27. split_patten = "。"
  28. sentences = re.split(split_patten,content)
  29. sentences = [x for x in sentences if len(x)!=0]
  30. lemmas = []
  31. doc_offsets = []
  32. dep_types = []
  33. dep_tokens = []
  34. '''
  35. tokens_all = fool.cut(sentences)
  36. ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
  37. for sentence_index in range(len(tokens_all)):
  38. sentence_text = sentences[sentence_index]
  39. tokens = tokens_all[sentence_index]
  40. pos_tag = []
  41. for i in zip(*fool.pos_cut(sentence_text)):
  42. pos_tag.append(i[0][1])
  43. ner_tag = ner_tag_all[sentence_index]
  44. yield[
  45. doc_id,
  46. sentence_index,
  47. sentence_text,
  48. tokens,
  49. lemmas,
  50. pos_tag,
  51. ner_tag,
  52. doc_offsets,
  53. dep_types,
  54. dep_tokens,]
  55. '''
  56. tokens_all = fool.cut(sentences)
  57. pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
  58. ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
  59. for sentence_index in range(len(sentences)):
  60. sentence_text = sentences[sentence_index]
  61. tokens = tokens_all[sentence_index]
  62. pos_tag = pos_all[sentence_index]
  63. ner_tag = ner_tag_all[sentence_index]
  64. yield[
  65. doc_id,
  66. sentence_index,
  67. sentence_text,
  68. tokens,
  69. lemmas,
  70. pos_tag,
  71. ner_tag,
  72. doc_offsets,
  73. dep_types,
  74. dep_tokens,]