gazettes_loader.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. """
  2. IEPY gazettes loader
  3. Usage:
  4. gazettes_loader.py <filename>
  5. The <filename> argument can be a .csv file or a .csv.gz file containing the
  6. gazettes in two columns: 'literal' and 'class'.
  7. Options:
  8. -h --help Show this screen
  9. """
  10. import sys
  11. import csv
  12. import gzip
  13. import logging
  14. from operator import itemgetter
  15. from django.db import IntegrityError
  16. from docopt import docopt
  17. import iepy
  18. iepy.setup(__file__)
  19. from iepy.data.models import EntityKind, GazetteItem
  20. logging.basicConfig(level=logging.INFO, format='%(message)s')
  21. def add_gazettes_from_csv(filepath):
  22. if filepath.endswith(".gz"):
  23. fin = gzip.open(filepath, "rt")
  24. else:
  25. fin = open(filepath, "rt")
  26. reader = csv.DictReader(fin)
  27. expected_fnames = ['literal', 'class']
  28. if not set(reader.fieldnames).issuperset(expected_fnames):
  29. msg = "Couldn't find the expected field names on the provided csv: {}"
  30. sys.exit(msg.format(expected_fnames))
  31. _create_gazette_entries(
  32. itemgetter(*expected_fnames)(line) for line in reader
  33. )
  34. def _create_gazette_entries(entries_list):
  35. kind_cache = {}
  36. created = 0
  37. for literal, kind_name in entries_list:
  38. literal = literal.strip()
  39. kind_name = kind_name.strip()
  40. kind = kind_cache.get(kind_name)
  41. if kind is None:
  42. kind, _ = EntityKind.objects.get_or_create(name=kind_name)
  43. kind_cache[kind_name] = kind
  44. gazette = GazetteItem(text=literal, kind=kind)
  45. try:
  46. gazette.save()
  47. except IntegrityError as error:
  48. logging.warn(
  49. "Gazette '{}' of class '{}' not loaded, literal already existed".format(
  50. literal, kind_name))
  51. print(error)
  52. finally:
  53. created += 1
  54. print('Created {} new gazette items'.format(created))
  55. if __name__ == "__main__":
  56. opts = docopt(__doc__, version=iepy.__version__)
  57. fname = opts["<filename>"]
  58. add_gazettes_from_csv(fname)