# nlp_markup_with_foolnltk.py
#!/usr/bin/env python
#encoding:utf-8
import re

import fool

from deepdive import *
#from transform import *
from commonutil import *
  8. @tsv_extractor
  9. @returns(lambda
  10. doc_id = "text",
  11. sentence_index = "int",
  12. sentence_text = "text",
  13. tokens = "text[]",
  14. lemmas = "text[]",
  15. pos_tag = "text[]",
  16. ner_tag = "text[]",
  17. doc_offsets = "int[]",
  18. dep_types = "text[]",
  19. dep_tokens = "int[]"
  20. :[])
  21. def extract(
  22. doc_id = "text",
  23. content ="text",
  24. ):
  25. if content is not None and len(content)!=0:
  26. log("doc_id====="+str(doc_id))
  27. split_patten = "。"
  28. sentences = re.split(split_patten,content)
  29. sentences = [x for x in sentences if len(x)!=0]
  30. lemmas = []
  31. doc_offsets = []
  32. dep_types = []
  33. dep_tokens = []
  34. '''
  35. tokens_all = fool.cut(sentences)
  36. ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
  37. for sentence_index in range(len(tokens_all)):
  38. sentence_text = sentences[sentence_index]
  39. tokens = tokens_all[sentence_index]
  40. pos_tag = []
  41. for i in zip(*fool.pos_cut(sentence_text)):
  42. pos_tag.append(i[0][1])
  43. ner_tag = ner_tag_all[sentence_index]
  44. yield[
  45. doc_id,
  46. sentence_index,
  47. sentence_text,
  48. tokens,
  49. lemmas,
  50. pos_tag,
  51. ner_tag,
  52. doc_offsets,
  53. dep_types,
  54. dep_tokens,]
  55. '''
  56. tokens_all = fool.cut(sentences)
  57. pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
  58. ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
  59. for sentence_index in range(len(sentences)):
  60. sentence_text = sentences[sentence_index]
  61. tokens = tokens_all[sentence_index]
  62. pos_tag = pos_all[sentence_index]
  63. ner_tag = ner_tag_all[sentence_index]
  64. yield[
  65. doc_id,
  66. sentence_index,
  67. sentence_text,
  68. tokens,
  69. lemmas,
  70. pos_tag,
  71. ner_tag,
  72. doc_offsets,
  73. dep_types,
  74. dep_tokens,]