# csv_to_iepy.py (1.1 KB)

  1. """
  2. Birthdate corpus preprocessing script.
  3. Usage:
  4. csv_to_iepy.py <filename>
  5. csv_to_iepy.py -h | --help
  6. The <filename> argument can be a .csv file or a .csv.gz file containing the
  7. corpus in two columns: 'freebase_mid' and 'description'.
  8. Options:
  9. -h --help Show this screen
  10. """
  11. import logging
  12. import csv
  13. import gzip
  14. import os
  15. from docopt import docopt
  16. from iepy.data.db import DocumentManager
  17. if __name__ == "__main__":
  18. logging.basicConfig(level=logging.INFO,
  19. format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
  20. opts = docopt(__doc__, version=0.1)
  21. name = opts["<filename>"]
  22. if name.endswith(".gz"):
  23. fin = gzip.open(name, "rt")
  24. else:
  25. fin = open(name, "rt")
  26. reader = csv.DictReader(fin)
  27. name = os.path.basename(name)
  28. docdb = DocumentManager()
  29. seen = set()
  30. for i, d in enumerate(reader):
  31. mid = d["freebase_mid"]
  32. if mid in seen:
  33. continue
  34. seen.add(mid)
  35. docdb.create_document(identifier=mid,
  36. text=d["description"],
  37. metadata={"input_filename": name})