#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tokenisation related functionality.

Author: Pontus Stenetorp <pontus stenetorp se>
Version: 2011-05-23
"""


def _token_boundaries_by_alignment(tokens, original_text):
    """Yield (start, end) character offsets for each token by aligning the
    token sequence against the original text, left to right."""
    curr_pos = 0
    for tok in tokens:
        start_pos = original_text.index(tok, curr_pos)
        # TODO: Check if we fail to find the token! (str.index raises
        # ValueError if the token does not occur at or after curr_pos.)
        end_pos = start_pos + len(tok)
        yield (start_pos, end_pos)
        curr_pos = end_pos
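
# Illustrative sketch (not part of the original module): the alignment
# generator maps already-split tokens back to character offsets in the
# source text, skipping over any whitespace between them, e.g.
#
#     >>> list(_token_boundaries_by_alignment(['Hello', 'world!'], 'Hello  world!'))
#     [(0, 5), (7, 13)]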


def jp_token_boundary_gen(text):
    """Japanese tokenisation using MeCab, falling back on whitespace
    tokenisation if the MeCab bindings cannot be imported."""
    try:
        from mecab import token_offsets_gen
        for o in token_offsets_gen(text):
            yield o
    except ImportError:
        from message import Messager
        Messager.error('Failed to import MeCab, '
                       'falling back on whitespace tokenization. '
                       'Please check configuration and/or server setup.')
        for o in whitespace_token_boundary_gen(text):
            yield o
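
# Note: the mecab.token_offsets_gen helper used above is assumed to yield
# (start, end) character offsets over the original text, i.e. the same
# interface as the other *_token_boundary_gen generators in this module.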


def gtb_token_boundary_gen(text):
    """Tokenisation using the GENIA Treebank (GTB) tokeniser, aligned back
    onto the original text to produce character offsets."""
    from gtbtokenize import tokenize
    tokens = tokenize(text).split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o


def whitespace_token_boundary_gen(text):
    """Plain whitespace tokenisation, yielding (start, end) character
    offsets for each whitespace-separated token."""
    tokens = text.split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o
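
# Hypothetical usage sketch (not part of the original module): the three
# generators above share the same (start, end) offset interface, so a caller
# could select one by name, e.g.
#
#     tokeniser_by_name = {
#         'whitespace': whitespace_token_boundary_gen,
#         'gtb': gtb_token_boundary_gen,
#         'mecab': jp_token_boundary_gen,
#     }
#     offsets = list(tokeniser_by_name['whitespace']('Hello world'))
#     # offsets == [(0, 5), (6, 11)]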


if __name__ == '__main__':
    # Simple test/demonstration: tokenise the given text files (or stdin)
    # and print the resulting offsets and tokens.
    from sys import argv

    from annotation import open_textfile

    def _text_by_offsets_gen(text, offsets):
        for start, end in offsets:
            yield text[start:end]

    if len(argv) == 1:
        argv.append('/dev/stdin')

    try:
        for txt_file_path in argv[1:]:
            print()
            print('### Tokenising:', txt_file_path)
            with open(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            print(text)
            print('# Original text:')
            print(text.replace('\n', '\\n'))
            #offsets = [o for o in jp_token_boundary_gen(text)]
            #offsets = [o for o in whitespace_token_boundary_gen(text)]
            offsets = [o for o in gtb_token_boundary_gen(text)]
            print('# Offsets:')
            print(offsets)
            print('# Tokens:')
            for tok in _text_by_offsets_gen(text, offsets):
                assert tok, 'blank tokens disallowed'
                assert not tok[0].isspace() and not tok[-1].isspace(), (
                    'tokens may not start or end with white-space "%s"' % tok)
                print('"%s"' % tok)
    except IOError:
        # Just let it through for now...
        raise
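
# Example invocation (hypothetical file names, assuming this module is saved
# as tokenise.py; with no arguments the script reads from /dev/stdin):
#
#     python tokenise.py document1.txt document2.txt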