  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """Primitive sentence splitting using Sampo Pyysalo's GeniaSS sentence split
  4. refiner. Also a primitive Japanese sentence splitter without refinement.
  5. Author: Pontus Stenetorp <pontus stenetorp se>
  6. Version: 2011-05-09
  7. """
  8. from re import compile as re_compile
  9. from re import DOTALL, VERBOSE
  10. # Constants
  11. # Reasonably well-behaved sentence end regular expression
  12. SENTENCE_END_REGEX = re_compile(r'''
  13. # Require a leading non-whitespace character for the sentence
  14. \S
  15. # Then, anything goes, but don't be greedy
  16. .*?
  17. # Anchor the sentence at...
  18. (:?
  19. # One (or multiple) terminal character(s)
  20. # followed by one (or multiple) whitespace
  21. (:?(\.|!|\?|。|!|?)+(?=\s+))
  22. | # Or...
  23. # Newlines, to respect file formatting
  24. (:?(?=\n+))
  25. | # Or...
  26. # End-of-file, excluding whitespaces before it
  27. (:?(?=\s*$))
  28. )
  29. ''', DOTALL | VERBOSE)
  30. # Only newlines can end a sentence to preserve pre-processed formatting
  31. SENTENCE_END_NEWLINE_REGEX = re_compile(r'''
  32. # Require a leading non-whitespace character for the sentence
  33. \S
  34. # Then, anything goes, but don't be greedy
  35. .*?
  36. # Anchor the sentence at...
  37. (:?
  38. # One (or multiple) newlines
  39. (:?(?=\n+))
  40. | # Or...
  41. # End-of-file, excluding whitespaces before it
  42. (:?(?=\s*$))
  43. )
  44. ''', DOTALL | VERBOSE)
  45. ###
  46. def _refine_split(offsets, original_text):
  47. # Postprocessor expects newlines, so add. Also, replace
  48. # sentence-internal newlines with spaces not to confuse it.
  49. new_text = '\n'.join((original_text[o[0]:o[1]].replace('\n', ' ')
  50. for o in offsets))
  51. from sspostproc import refine_split
  52. output = refine_split(new_text)
  53. # Align the texts and see where our offsets don't match
  54. old_offsets = offsets[::-1]
  55. # Protect against edge case of single-line docs missing
  56. # sentence-terminal newline
  57. if len(old_offsets) == 0:
  58. old_offsets.append((0, len(original_text), ))
  59. new_offsets = []
  60. for refined_sentence in output.split('\n'):
  61. new_offset = old_offsets.pop()
  62. # Merge the offsets if we have received a corrected split
  63. while new_offset[1] - new_offset[0] < len(refined_sentence) - 1:
  64. _, next_end = old_offsets.pop()
  65. new_offset = (new_offset[0], next_end)
  66. new_offsets.append(new_offset)
  67. # Protect against missing document-final newline causing the last
  68. # sentence to fall out of offset scope
  69. if len(new_offsets) != 0 and new_offsets[-1][1] != len(original_text) - 1:
  70. start = new_offsets[-1][1] + 1
  71. while start < len(original_text) and original_text[start].isspace():
  72. start += 1
  73. if start < len(original_text) - 1:
  74. new_offsets.append((start, len(original_text) - 1))
  75. # Finally, inject new-lines from the original document as to respect the
  76. # original formatting where it is made explicit.
  77. last_newline = -1
  78. while True:
  79. try:
  80. orig_newline = original_text.index('\n', last_newline + 1)
  81. except ValueError:
  82. # No more newlines
  83. break
  84. for o_start, o_end in new_offsets:
  85. if o_start <= orig_newline < o_end:
  86. # We need to split the existing offsets in two
  87. new_offsets.remove((o_start, o_end))
  88. new_offsets.extend(((o_start, orig_newline, ),
  89. (orig_newline + 1, o_end), ))
  90. break
  91. elif o_end == orig_newline:
  92. # We have already respected this newline
  93. break
  94. else:
  95. # Stand-alone "null" sentence, just insert it
  96. new_offsets.append((orig_newline, orig_newline, ))
  97. last_newline = orig_newline
  98. new_offsets.sort()
  99. return new_offsets
  100. def _sentence_boundary_gen(text, regex):
  101. for match in regex.finditer(text):
  102. yield match.span()
  103. def regex_sentence_boundary_gen(text):
  104. for o in _refine_split([_o for _o in _sentence_boundary_gen(
  105. text, SENTENCE_END_REGEX)], text):
  106. yield o
  107. def newline_sentence_boundary_gen(text):
  108. for o in _sentence_boundary_gen(text, SENTENCE_END_NEWLINE_REGEX):
  109. yield o
  110. if __name__ == '__main__':
  111. from sys import argv
  112. from annotation import open_textfile
  113. def _text_by_offsets_gen(text, offsets):
  114. for start, end in offsets:
  115. yield text[start:end]
  116. if len(argv) > 1:
  117. try:
  118. for txt_file_path in argv[1:]:
  119. print()
  120. print('### Splitting:', txt_file_path)
  121. with open_textfile(txt_file_path, 'r') as txt_file:
  122. text = txt_file.read()
  123. print('# Original text:')
  124. print(text.replace('\n', '\\n'))
  125. offsets = [o for o in newline_sentence_boundary_gen(text)]
  126. print('# Offsets:')
  127. print(offsets)
  128. print('# Sentences:')
  129. for sentence in _text_by_offsets_gen(text, offsets):
  130. # These should only be allowed when coming from original
  131. # explicit newlines.
  132. #assert sentence, 'blank sentences disallowed'
  133. # assert not sentence[0].isspace(), (
  134. # 'sentence may not start with white-space "%s"' % sentence)
  135. print('"%s"' % sentence.replace('\n', '\\n'))
  136. except IOError:
  137. pass # Most likely a broken pipe
  138. else:
  139. sentence = 'This is a short sentence.\nthis is another one.'
  140. print('Sentence:', sentence)
  141. print('Len sentence:', len(sentence))
  142. ret = [o for o in en_sentence_boundary_gen(sentence)]
  143. last_end = 0
  144. for start, end in ret:
  145. if last_end != start:
  146. print('DROPPED: "%s"' % sentence[last_end:start])
  147. print('SENTENCE: "%s"' % sentence[start:end])
  148. last_end = end
  149. print(ret)
  150. sentence = ' 変しん! 両になった。うそ! かも '
  151. print('Sentence:', sentence)
  152. print('Len sentence:', len(sentence))
  153. ret = [o for o in jp_sentence_boundary_gen(sentence)]
  154. ans = [(1, 5), (6, 12), (12, 15), (16, 18)]
  155. assert ret == ans, '%s != %s' % (ret, ans)
  156. print('Successful!')
  157. sentence = ' One of these days Jimmy, one of these days. Boom! Kaboom '
  158. print('Sentence:', sentence)
  159. print('Len sentence:', len(sentence))
  160. ret = [o for o in en_sentence_boundary_gen(sentence)]
  161. ans = [(1, 44), (45, 50), (51, 57)]
  162. assert ret == ans, '%s != %s' % (ret, ans)
  163. print('Successful!')