preprocess.py 951 B

12345678910111213141516171819202122232425262728293031323334
  1. """
  2. Birthdate corpus preprocessing script
  3. Usage:
  4. preprocess.py
  5. preprocess.py -h | --help | --version
  6. Options:
  7. -h --help Show this screen
  8. --version Version number
  9. """
  10. import logging
  11. from docopt import docopt
  12. from iepy.data.db import DocumentManager
  13. from iepy.preprocess.stanford_preprocess import StanfordPreprocess
  14. from iepy.preprocess.pipeline import PreProcessPipeline
  15. from iepy.preprocess.segmenter import SyntacticSegmenterRunner
  16. if __name__ == '__main__':
  17. logger = logging.getLogger(u'preprocess')
  18. logger.setLevel(logging.INFO)
  19. logging.basicConfig(level=logging.INFO,
  20. format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
  21. opts = docopt(__doc__, version=0.1)
  22. docs = DocumentManager()
  23. pipeline = PreProcessPipeline([
  24. StanfordPreprocess(),
  25. SyntacticSegmenterRunner(increment=True)
  26. ], docs
  27. )
  28. pipeline.process_everything()