# Source code for dcs_wrapper.wrapper

'''
A simple wrapper around the DCS database that supports iteration

@author: avinashvarna
'''

from __future__ import print_function
import os
import logging
import codecs
import tarfile

try:
    from functools import partialmethod
except ImportError:
    from backports.functools_partialmethod import partialmethod


from .base import from_json
from .utils import app_dir


__docformat__ = 'reStructuredText'


class DCS(object):
    '''Simple wrapper around the DCS database supporting iteration

    :param file_path: Use the data in directory file_path.
        file_path = None defaults to using data supplied with the
        package which will be extracted on first use
    :type file_path: str, None

    :Example:

    >>> from dcs_wrapper import DCS
    >>> with DCS() as dcs:
    ...     for book in dcs.iter_books():
    ...         print(book.dcsId, book.title)
    ...     for chapter in dcs.iter_chapters():
    ...         print(chapter.dcsId, chapter.dcsName)
    ...     for sentence in dcs.iter_sentences():
    ...         print(sentence.dcsId, sentence.text)
    '''

    def __init__(self, file_path=None):
        self.logger = logging.getLogger(__name__)
        self._setup_directory(file_path)

    def _setup_directory(self, file_path):
        '''Resolve the data directory and make sure the data files exist

        Sets ``self.file_path`` and the three ``self.*_file`` paths. If any
        of the three data files is missing, the archive bundled next to this
        module is extracted into ``self.file_path``.

        :param file_path: directory holding the data files; a falsy value
            selects ``app_dir('DCS_Wrapper')``
        :type file_path: str, None
        :raises IOError: if a data file is still missing after extraction
        '''
        data_file = os.path.join(os.path.dirname(__file__),
                                 "data", "dcs_data.tar.gz")
        self.file_path = file_path or app_dir('DCS_Wrapper')
        self.sentences_file = os.path.join(self.file_path,
                                           'dcs_sentences_json.txt')
        self.books_file = os.path.join(self.file_path,
                                       'dcs_books_json.txt')
        self.chapters_file = os.path.join(self.file_path,
                                          'dcs_chapters_json.txt')
        file_list = [self.books_file, self.chapters_file,
                     self.sentences_file]

        if not all(map(os.path.exists, file_list)):
            self.logger.info("Unzipping data files for first use")
            with tarfile.open(data_file, 'r') as tar:
                tar.extractall(self.file_path)

        # Explicit raise instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would defer the failure to first read.
        if not all(map(os.path.exists, file_list)):
            raise IOError("One of %s does not exist" % (file_list,))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Nothing to release here: each iterator opens and closes its own
        # file handle. Returning None lets exceptions propagate.
        pass

    def _iter_file(self, category):
        '''Yield one decoded object per line of the file for ``category``

        :param category: one of "books", "chapters" or "sentences"
        :type category: str
        :raises ValueError: if ``category`` is not a known category; because
            this is a generator, the error surfaces on first iteration
        '''
        files = {
            "books": self.books_file,
            "chapters": self.chapters_file,
            "sentences": self.sentences_file,
        }
        if category not in files:
            raise ValueError("Unknown category %r, expected one of %s"
                             % (category, sorted(files)))

        with codecs.open(files[category], "rb", "utf8") as f:
            for line in f:
                yield from_json(line)

    iter_sentences = partialmethod(_iter_file, "sentences")
    '''Iterate over sentences in DCS database, yielding one
    :class:`~dcs_wrapper.Sentence` at a time'''

    iter_books = partialmethod(_iter_file, "books")
    '''Iterate over books in DCS database, yielding one
    :class:`~dcs_wrapper.Book` at a time'''

    iter_chapters = partialmethod(_iter_file, "chapters")
    '''Iterate over chapters in DCS database, yielding one
    :class:`~dcs_wrapper.Chapter` object at a time'''