# -*- coding: utf-8 -*-
import re

import nltk.data
from nltk.tokenize import RegexpTokenizer

from iepy.preprocess.pipeline import BasePreProcessStepRunner, PreProcessSteps
from iepy.utils import DIRS


class TokenizeSentencerRunner(BasePreProcessStepRunner):
    """Does tokenization and sentence segmentation together over IEDocuments.

    - If override=True, both steps are run and their results stored on the
      document, no matter whether they were already computed or not.
    - If override=False and only one of the two steps was done, it behaves
      exactly as with override=True.
    - If override=False and both steps were already done on the document,
      it does nothing.
    """
    step = PreProcessSteps.tokenization

    def __init__(self, override=False, increment=False, lang='en'):
        if lang != 'en':
            # Only English tokenization and segmentation are provided right
            # now. If you need support for another language, this is a good
            # place to add it.
            raise NotImplementedError
        self.lang = lang
        self.override = override
        self.increment = increment
        # We'll be doing 2 preprocess steps in one here.
        self.tkn_step = PreProcessSteps.tokenization
        self.snt_step = PreProcessSteps.sentencer

    def __call__(self, doc):
        tkn_done = doc.was_preprocess_step_done(self.tkn_step)
        snt_done = doc.was_preprocess_step_done(self.snt_step)
        if self.override or not (tkn_done and snt_done):
            # Ok, let's do it
            result = en_tokenize_and_segment(doc.text)
            doc.set_tokenization_result(
                list(zip(result['spans'], result['tokens'])))
            doc.set_sentencer_result(result['sentences'])
            doc.save()
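
# A rough usage sketch (not part of the original module; `doc` here stands for
# any IEDocument-like object exposing `text`, `was_preprocess_step_done`, the
# two result setters and `save`):
#
#     runner = TokenizeSentencerRunner(override=True)
#     runner(doc)  # tokenizes and segments doc.text, then stores both results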


def en_tokenize_and_segment(text):
    """
    Tokenizes and segments a string `text` interpreted as English text.

    Returns a dict with keys 'tokens', 'spans' and 'sentences', where:
        - tokens is a list of strings corresponding to the tokens in `text`.
        - spans is a list of the starting offsets in `text` for each token
          in `tokens`.
        - sentences is a list of indexes that represent the start and end
          position of each sentence, like this: the i-th sentence starts on
          token `sentences[i]` and ends on token `sentences[i + 1]`.
          There are `len(sentences) - 1` sentences represented in the list.
    """
    tokenizer = _get_tokenizer()

    tokens = []
    spans = []
    sentences = [0]
    for sentence_i, sentence_j, sentence in _split_in_sentences(text):
        if sentence_i == sentence_j:
            # Empty sentence: nothing to tokenize.
            continue
        for i, j in tokenizer.span_tokenize(sentence):
            # Token offsets are relative to the sentence; shift them so they
            # become absolute offsets into `text`.
            spans.append(sentence_i + i)
            tokens.append(sentence[i:j])
        sentences.append(len(tokens))
    return {'tokens': tokens,
            'spans': spans,
            'sentences': sentences}
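
# For reference (not in the original): each result['spans'][k] is the offset
# of result['tokens'][k] inside `text`, and the tokens of sentence i are the
# slice tokens[sentences[i]:sentences[i + 1]].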


def _split_in_sentences(text):
    # Make sure NLTK also looks for its data (the punkt model) inside IEPY's
    # user data dir.
    if not nltk.data.path or nltk.data.path[-1] != DIRS.user_data_dir:
        nltk.data.path.append(DIRS.user_data_dir)
    sentence_splitter = nltk.data.load("tokenizers/punkt/english.pickle")
    for i, j in sentence_splitter.span_tokenize(text):
        yield i, j, text[i:j]
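
# Each yielded item is a (start, end, sentence_text) triple. For example,
# _split_in_sentences("One. Two.") should yield roughly
#     (0, 4, "One.") and (5, 9, "Two.")
# (exact offsets depend on the punkt model; this is only an illustration).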


###
### English tokenizer using regular expressions
###

basic_macros = {
    "AN1": "[a-z0-9]",
    "AN2": "[a-z0-9\\._]",
    "AN3": r"[a-z0-9-_\.~!*'();:@&=+$,/?%#\[\]]",
}
macros = {
    "USERNAME": "{AN1}{AN2}*",
    "HOSTNAME": "{AN1}{AN2}*",
    "HOSTNAME2": r"{AN1}{AN2}*\.{AN2}*",
    "HOSTNAME3": r"{AN1}{AN2}*(:[0-9]{{1,5}})?",
    "HOSTNAME4": r"www\.{AN1}{AN2}*\.{AN2}*(:[0-9]{{1,5}})?",
    "SCHEME": "mailto:|((http|https|ftp|ftps|ssh|git|news)://)",
}
macros = {k: "(" + v.format(**basic_macros) + ")"
          for k, v in macros.items()}
macros.update(basic_macros)
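
# For reference (derived by hand from the definitions above): after this
# expansion, macros["USERNAME"] is the string "([a-z0-9][a-z0-9\._]*)", so a
# template such as "{USERNAME}@{HOSTNAME2}" below turns into a fully
# spelled-out pattern once formatted with `macros`.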

# Smiley detection
eyes = ":;8xX>="
noses = [""] + list("-o")
mouths = list("DP/") + ["}}", "{{", "\\[", "\\]", "\\(", "\\)", "\\|"]
smileys = [x + y + z for x in eyes for y in noses for z in mouths]
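# This builds every eyes+nose+mouth combination as a small regex fragment,
# e.g. ":D", ":-D", ";P" and "8\\)" (which matches the literal text "8)").
# Brackets, parens and pipes are backslash-escaped because they are regex
# metacharacters; the braces are doubled so they survive the later
# str.format() call.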

HEADER = [
    "([01]?[0-9]|2[0-4]):[0-5]?[0-9](:[0-5]?[0-9])?",  # Time of day
    "''|``",                                           # Quotation
    "{USERNAME}@{HOSTNAME2}",                          # Typical email
    "{SCHEME}({USERNAME}@)?{HOSTNAME3}(/{AN3}*)?",     # URI
    "{HOSTNAME4}",                                     # Typical URL
]

FOOTER = [
    "\w+&\w+",                            # Ampersand-joined words (e.g. AT&T)
    "\w+",                                # Normal words
    "|".join(smileys),                    # Smileys
    "[()/\[\]\\.,;:\-\"'`~?]|\\.\\.\\.",  # Punctuation marks
    "\S+",                                # Anything else (catch-all)
]

english_contractions = [
    "ain't",
    "aren't",
    "can't",
    "can't've",
    "'cause",
    "could've",
    "couldn't",
    "couldn't've",
    "didn't",
    "doesn't",
    "don't",
    "hadn't",
    "hadn't've",
    "hasn't",
    "haven't",
    "he'd",
    "he'd've",
    "he'll",
    "he'll've",
    "he's",
    "how'd",
    "how'd'y",
    "how'll",
    "how's",
    "I'd",
    "I'd've",
    "I'll",
    "I'll've",
    "I'm",
    "I've",
    "isn't",
    "it'd",
    "it'd've",
    "it'll",
    "it'll've",
    "it's",
    "let's",
    "ma'am",
    "might've",
    "mightn't",
    "mightn't've",
    "must've",
    "mustn't",
    "mustn't've",
    "needn't",
    "o'clock",
    "oughtn't",
    "oughtn't've",
    "shan't",
    "shan't've",
    "she'd",
    "she'd've",
    "she'll",
    "she'll've",
    "she's",
    "should've",
    "shouldn't",
    "shouldn't've",
    "so's",
    "that's",
    "there'd",
    "there's",
    "they'd",
    "they'll",
    "they'll've",
    "they're",
    "they've",
    "to've",
    "wasn't",
    "we'd",
    "we'll",
    "we'll've",
    "we're",
    "we've",
    "weren't",
    "what'll",
    "what'll've",
    "what're",
    "what's",
    "what've",
    "when's",
    "when've",
    "where'd",
    "where's",
    "where've",
    "who'll",
    "who'll've",
    "who's",
    "who've",
    "why's",
    "will've",
    "won't",
    "won't've",
    "would've",
    "wouldn't",
    "wouldn't've",
    "y'all",
    "y'all'd've",
    "y'all're",
    "y'all've",
    "you'd",
    "you'd've",
    "you'll",
    "you'll've",
    "you're",
    "you've",
]

en_regex = HEADER + [
    "[01]?[0-9][-/.][0123]?[0-9][-/.][0-9]{{2,4}}",  # Date mm/dd/yyyy
    "|".join(english_contractions),                  # Common contractions
    "'s",                                            # Possessive
    "\w+([_-]\w+)+",                                 # Normal words+compounds
] + FOOTER
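
# Order matters inside the final alternation: at each position the first
# matching alternative wins, so dates, contractions and compounds are tried
# before the generic "\w+", and "\S+" acts as a last-resort catch-all.
# A rough check (hypothetical snippet, not part of the module):
#
#     >>> _get_tokenizer().tokenize("I don't like e-mail at 10:30")
#     # "don't", "e-mail" and "10:30" should each come out as single tokens.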


def _get_tokenizer(__cache=[]):
    """
    Get a tokenizer for English.
    """
    # The mutable default argument is used on purpose as a memoization cache,
    # so the combined regex is only built and compiled once.
    if not __cache:
        regex = [x.format(**macros) for x in en_regex]
        regex = u"|".join(regex)
        tokenizer = RegexpTokenizer(regex, flags=re.UNICODE |
                                                 re.MULTILINE |
                                                 re.DOTALL | re.I)
        __cache.append(tokenizer)
    return __cache[0]
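

if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the original module. It assumes
    # the NLTK punkt model is available (either on the default nltk.data path
    # or under DIRS.user_data_dir).
    sample = "IEPY splits text into tokens. It also finds sentence bounds."
    result = en_tokenize_and_segment(sample)
    # Print the tokens of each sentence, using the sentence boundary indexes.
    for k in range(len(result['sentences']) - 1):
        start, end = result['sentences'][k], result['sentences'][k + 1]
        print(result['tokens'][start:end])
    # Every span should point back at its token inside the original string.
    for offset, token in zip(result['spans'], result['tokens']):
        assert sample[offset:offset + len(token)] == token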