tokenise.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Tokenisation related functionality.

Author:  Pontus Stenetorp <pontus stenetorp se>
Version: 2011-05-23
"""

def _token_boundaries_by_alignment(tokens, original_text):
    """Align tokens back to the original text, yielding (start, end) offsets."""
    curr_pos = 0
    for tok in tokens:
        start_pos = original_text.index(tok, curr_pos)
        # TODO: Check if we fail to find the token!
        # (str.index raises ValueError if the token does not occur after curr_pos.)
        end_pos = start_pos + len(tok)
        yield (start_pos, end_pos)
        curr_pos = end_pos
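
# A minimal illustration of the alignment helper (hypothetical token list, not
# produced by any tokeniser in this module): offsets always index into the
# original string, so intervening whitespace is skipped rather than collapsed.
#
#     >>> list(_token_boundaries_by_alignment(['Hello', ',', 'world'], 'Hello, world'))
#     [(0, 5), (5, 6), (7, 12)]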

def jp_token_boundary_gen(text):
    """Tokenise Japanese text with MeCab; fall back on whitespace tokenisation."""
    try:
        from mecab import token_offsets_gen
        for o in token_offsets_gen(text):
            yield o
    except ImportError:
        from message import Messager
        Messager.error('Failed to import MeCab, '
                       'falling back on whitespace tokenization. '
                       'Please check configuration and/or server setup.')
        for o in whitespace_token_boundary_gen(text):
            yield o

def gtb_token_boundary_gen(text):
    """Tokenise text with the GTB tokeniser (gtbtokenize) and yield offsets."""
    from gtbtokenize import tokenize
    tokens = tokenize(text).split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o

def whitespace_token_boundary_gen(text):
    """Tokenise text on whitespace, yielding offsets into the original text."""
    tokens = text.split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o
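
# Illustrative usage (assumes plain whitespace-separated input): runs of
# whitespace are skipped in the offsets, e.g.
#
#     >>> list(whitespace_token_boundary_gen('Hello  world'))
#     [(0, 5), (7, 12)]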

if __name__ == '__main__':
    from sys import argv

    from annotation import open_textfile

    def _text_by_offsets_gen(text, offsets):
        for start, end in offsets:
            yield text[start:end]

    if len(argv) == 1:
        argv.append('/dev/stdin')

    try:
        for txt_file_path in argv[1:]:
            print()
            print('### Tokenising:', txt_file_path)
            with open(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            print(text)
            print('# Original text:')
            print(text.replace('\n', '\\n'))
            #offsets = [o for o in jp_token_boundary_gen(text)]
            #offsets = [o for o in whitespace_token_boundary_gen(text)]
            offsets = [o for o in gtb_token_boundary_gen(text)]
            print('# Offsets:')
            print(offsets)
            print('# Tokens:')
            for tok in _text_by_offsets_gen(text, offsets):
                assert tok, 'blank tokens disallowed'
                assert not tok[0].isspace() and not tok[-1].isspace(), (
                    'tokens may not start or end with white-space "%s"' % tok)
                print('"%s"' % tok)
    except IOError:
        # Just re-raise; for now, let I/O errors crash loudly.
        raise