Skip to content

DHTK

dhtk.core.system

Utility functions used by all modules.

download_files(urls, path='./', file_names='')

Function to download files from the Web.

Parameters:

Name Type Description Default
urls Union[str, List[str]]

URLs of the files to be downloaded

required
path str

Path to directory to store the downloaded files. (Default value = "./")

'./'
file_names Union[str, List[str]]

Output name or names of the files to be downloaded.

''
Source code in dhtk/core/system.py
def download_files(
        urls: typing.Union[str, typing.List[str]],
        path: str = "./",
        file_names: typing.Union[str, typing.List[str]] = "") -> \
        typing.Union[str, typing.List[str]]:
    """Function to download files from the Web.

    Args:
      urls (typing.Union[str, typing.List[str]]): URLs of the files to be downloaded
      path (str, optional): Path to directory to store the downloaded files. The directory itself is
        created when missing; its parent must already exist. (Default value = "./")
      file_names (typing.Union[str, typing.List[str]], optional): Output name or names of the files to be
        downloaded, one per URL. When empty, each name is derived from the last path segment of its URL,
        without the query string. (Default value = "")

    Returns:
      typing.Union[str, typing.List[str]]: the file name (single file) or the list of file names.
      When every target file already exists nothing is requested and the existing path, or list of
      paths, is returned as ``pathlib.Path`` instead.

    Raises:
      IndexError: if the number of file names differs from the number of URLs.
      requests.HTTPError: if a server answers a download request with an error status.

    """

    # Convert inputs to list
    if isinstance(urls, str):
        urls = [urls]

    path = pathlib.Path(path)
    path.mkdir(exist_ok=True)

    # Resolve one output name per URL *before* looking at the disk, so that the
    # "already downloaded" check below works on the real names.
    if not file_names:
        file_names = [url.split('/')[-1].split('?')[0] for url in urls]
    elif isinstance(file_names, str):
        file_names = [file_names, ]
    else:
        file_names = list(file_names)

    if len(file_names) != len(urls):
        raise IndexError("The list of filenames should correspond to the list of urls.")

    # Nothing to do when every file is already there; no network access is needed.
    file_paths = [path / file_name for file_name in file_names]
    if file_paths and all(file_path.exists() for file_path in file_paths):
        if len(file_paths) == 1:
            return file_paths[0]
        return file_paths

    # Request all missing files
    headers = {'User-Agent': get_platform()}
    for url, file_name, file_path in zip(urls, file_names, file_paths):

        if file_path.exists():  # Already downloaded
            continue

        if not url_exists(url):  # Warn if URL is not valid
            msg = f"URL not available: {url}"
            warnings.warn(msg)
            logger.warning("DOWNLOAD: %s", msg)
            continue

        # Make request
        with requests.get(url, stream=True, headers=headers) as request:
            request.raise_for_status()
            chunk = 8192
            # Content-Length is optional (e.g. chunked responses); tqdm accepts total=None
            content_length = request.headers.get('Content-Length')
            total = int(content_length) if content_length else None

            # Read to file
            with open(file_path, 'wb') as out_file, \
                    tqdm.tqdm(total=total, desc=f"Downloading {file_name}") as progress_bar:
                for part in request.iter_content(chunk_size=chunk):
                    out_file.write(part)
                    # The last part is usually shorter than the chunk size
                    progress_bar.update(len(part))
        logger.info("DOWNLOAD: %s downloaded from  %s to %s", file_name, url, path)

    if len(file_names) == 1:  # Return a string if there is only one file
        return file_names[0]

    return file_names

get_date(url)

Function to get last modified date of a remote file

Parameters:

Name Type Description Default
url str

the url link

required

Returns:

Type Description
datetime (datetime)

the datetime object

Source code in dhtk/core/system.py
def get_date(url: str) -> datetime.datetime:
    """Return the last modified date announced for a remote file.

    A HEAD request is sent to the URL and its ``last-modified`` response
    header is parsed.

    Args:
      url (str): the url link

    Returns:
      datetime (datetime): the date parsed from the header with the format
      ``'%a, %d %b %Y %H:%M:%S %Z'``

    Raises:
      KeyError: if the response carries no ``last-modified`` header.
      ValueError: if the header does not match the expected format.

    """

    response = requests.head(url, stream=True, headers={'User-Agent': get_platform()})
    stamp = response.headers['last-modified']
    return datetime.datetime.strptime(stamp, '%a, %d %b %Y %H:%M:%S %Z')

get_platform()

Returns the computer's platform.

Source code in dhtk/core/system.py
def get_platform():
    """Return a quoted description of the computer's platform.

    The text combines the operating system name and the first element of
    ``platform.architecture()``, e.g. ``"(Linux; U; 64bit; en-us)"``
    including the surrounding double quotes.
    """
    system_name = platform.system()
    word_size = platform.architecture()[0]
    return '"(' + system_name + '; U; ' + word_size + '; en-us)"'

make_dirs(directories)

Function to create new directories at DHTK's initiation.

Parameters:

Name Type Description Default
directories Union[str, List[str]]

the new directory path

required
Source code in dhtk/core/system.py
def make_dirs(directories: typing.Union[str, typing.List[str]]) -> None:
    """Create one or several directories, including missing parents.

    Args:
      directories (typing.Union[str, typing.List[str]]): a directory path, or a list of directory paths.
        Any value that is not a list is treated as a single path.

    Raises:
      SystemExit: if a directory cannot be created.

    """
    targets = directories if isinstance(directories, list) else [directories]

    for target in map(pathlib.Path, targets):
        # Nothing to do for directories that are already there
        if target.is_dir():
            continue

        try:
            target.mkdir(parents=True, exist_ok=True)
        except (IOError, PermissionError):
            sys.exit(f"Could not create {target}, please check user rights.")

pip_install(module_type, module)

Helper function to install missing modules

Parameters:

Name Type Description Default
module_type str

possible values "data_sources" or "storage"

required
module

"the dhtk module name"

required

Returns:

Type Description

the imported module.

Source code in dhtk/core/system.py
def pip_install(module_type, module):
    """
    Helper function to install a missing dhtk module with pip, then import it.

    The user is asked for confirmation on the console before pip is run.

    Args:
        module_type (str): possible values "data_sources"  or "storage"
        module: "the dhtk module name"

    Returns:
        the imported module.

    Raises:
        EnvironmentError: if the session is not interactive, or if the user declines the installation.

    NOTE(review): module names containing "dummy" strip a trailing "s" from
    ``module_type`` for the package name, while all other modules append an
    "s" for the repository URL - confirm which form callers actually pass.
    """
    if 'dummy' in module:
        package = f"dhtk_{module_type.rstrip('s')}_{module}"
        import_path = f"dhtk.{module_type}.{module}"
        repository = f"git+ssh://git@gitlab.com/dhtk/dhtk_{module_type}/examples/{package}"
    else:
        package = f"dhtk_{module_type}_{module}"
        import_path = f"dhtk.{module_type.rstrip('s')}.{module}"
        repository = f"git+ssh://git@gitlab.com/dhtk/dhtk_{module_type}s/{package}"

    if not IS_INTERACTIVE:
        raise EnvironmentError(f"This method is for interactive usage only! Please install {package} manually:"
                               f"$ pip install {repository}")

    reply = input(f"Do you want dhtk to install {package}? [y/N]")
    if not reply.lower().startswith("y"):
        raise EnvironmentError(f"Please install the module manually: $ pip install {repository}")

    # Run pip with the interpreter that is executing dhtk
    pip_command = [sys.executable, '-m', 'pip', 'install', repository]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError:
        failure = f"Module {package} not available."
        warnings.warn(failure)
        logger.error("DATASET: " + failure)

    # The import is attempted even when pip failed
    return importlib.import_module(import_path)

url_exists(url)

Function to check if a URL is available.

Parameters:

Name Type Description Default
url str

the path to check

required
Source code in dhtk/core/system.py
def url_exists(url: str):
    """Function to check if a URL is available.

    Args:
      url (str): the path to check. ``file:`` URLs are checked on the local file system,
        any other URL is checked with a HEAD request.

    Returns:
      bool: True if the local file exists or the server answers with a non-error status,
      False otherwise (including when the request itself fails).

    """

    # If the URL is a local file
    if url.startswith("file:"):
        from urllib.parse import unquote, urlparse

        parsed = urlparse(url)
        location = parsed.path
        # Keep a non-local authority as the leading path component
        if parsed.netloc not in ("", "localhost"):
            location = f"//{parsed.netloc}{location}"
        # URLs are percent-encoded (e.g. "%20" for a space); file names are not
        return os.path.exists(unquote(location))

    # If URL is a remote file
    try:
        response = requests.head(url)
    except requests.exceptions.RequestException:
        # Connection errors, timeouts and malformed URLs all mean "not available"
        return False
    return response.ok

dhtk.data_sources.gutenberg special

Gutenberg extension data_source

Module (AbstractDataSource)

Gutenberg Triplestore Class

Source code in dhtk/data_sources/gutenberg/__init__.py
class Module(AbstractDataSource):
    """Gutenberg Triplestore Class

    Data source that queries Gutenberg metadata through a `GutenbergData`
    wrapper and collects the selected books in a `GutenbergCorpus`.
    """
    name = "gutenberg"
    storage_type = "triplestore"
    data_file = "https://sandbox.zenodo.org/record/967894/files/gutenberg-dhtk-reasoned.ttl?download=1"

    @classmethod
    def get_data_file(cls, output_path, storage_type):
        """
        Get a content as defined in self.data_file and write into a file into output_path
        Args:
            output_path (Path): the path where to write (i.e. output_path = Path('WD/gutenberg/data/triplestore'))
            storage_type (str): the type of the storage

        Returns:
            Path: the path to the file with data

        """
        #output_path can be like this output_path = 'WD/gutenberg/data/triplestore'
        #storage_type can be like this storage_type= 'triplestore'
        if isinstance(cls.storage_type, str):
            data_file = cls.data_file
        else:
            # Several storage types: data_file is expected to list one URL per
            # storage type, in the same order (assumption - TODO confirm).
            data_file = cls.data_file[cls.storage_type.index(storage_type)]
        download_files(data_file, output_path, "gutenberg.ttl")
        # Accept plain strings as well as Path objects
        return pathlib.Path(output_path) / "gutenberg.ttl"

    def __init__(self, working_directory, endpoints):
        """
        Connect to the SPARQL endpoint and create an empty corpus.
        Args:
            working_directory: directory handed to the corpus as its working directory
            endpoints: sequence of endpoints; only the first one is used
        """

        # Get the extension
        self.wrapper = GutenbergData(sparql_endpoint=endpoints[0])

        # Instantiate a Corpus
        self._corpus = GutenbergCorpus(
            name=f"{self.name.capitalize()} Corpus",
            working_directory=working_directory,
            description="Gutenberg Books"
        )

    def welcome(self):
        """
        Ask the wrapper for statistics, retrying up to 10 times with a pause
        of 10 seconds after each connection failure.
        """
        # NOTE(review): `stats` is neither returned nor displayed - confirm
        # whether the statistics were meant to be shown to the user.
        stats = None
        for check in range(10):
            try:
                stats = self.wrapper.statistics()
                break
            except (RemoteDisconnected, URLError, ConnectionResetError):
                if check >= 9:
                    warnings.warn("WARNING: There is a problem with the connection!")
                    print("Probably Docker is slow to restart!")
                    stats = "\nNo statistics available"
                    break
                sleep(10)

    def get(self, what, name="all", add=False):
        """
        Extension wrapper method to call all DHTK functionalities with a simple syntax

        Parameters
        ----------
        what: string
            Type of information to retrieve.
            DHTK Gutenberg has the options to search for books, authors, shelves and subjects.
            Only the first two letters are significant ("bo", "au", "sh", "su").
        name: string [default: "all"]
            Name identifying the specific information to retrieve.
            If all, retrieve all information available
        add: boolean [default: False]
            Add query results to Corpus

        Returns
        -------
        Requested book information from Gutenberg dataset,
        or None when `what` is not a valid option
        """

        # Prepare arguments
        name = name.strip().lower()
        what = what.strip().lower()

        if what.startswith("bo"):
            if name == "all":
                response = self.wrapper.all_books()
            else:
                response = self.wrapper.search_by_title(name)
                response = [self.wrapper.get_book(book["book_id"]) for book in response]

                if add:
                    self._corpus.add_books(response)

                # Use a short book title
                response = {f"{book.get_title()[:20]} ({book.get_book_id_number()})": book
                            for book in response}

        elif what.startswith("au"):
            if name == "all":
                response = self.wrapper.all_authors()
            else:
                response = self.wrapper.search_by_author(name)
                ids = set(author["author_id"] for author in response)
                response = [self.wrapper.get_author(author) for author in ids]

                if add:
                    for author in response:
                        for title in author.get_bibliography():
                            # Search by title, then keep this author's books only
                            found = self.get(what="book", name=title)
                            by_author = {key: book for key, book in found.items()
                                         if book.get_author() == author}

                            # Match short book key name
                            sub_title = re.sub(r"\s+", " ", title[:20])
                            self._corpus.add_books(
                                [book for key, book in by_author.items() if sub_title in key]
                            )

                response = {f"{author.get_full_name()}": author for author in response}

        elif what.startswith("sh"):
            if name == "all":
                response = self.wrapper.all_bookshelves()
            else:
                response = self.wrapper.search_by_bookshelf(name)
                if add:
                    books_to_add = [self.wrapper.get_book(book["book_id"]) for book in response]
                    self._corpus.add_books(books_to_add)

                # Group "title [author]" labels by bookshelf, without duplicates
                results = {}
                for shelf in response:
                    book = f"{shelf['title']} [{shelf['author']}]"
                    labels = results.setdefault(shelf['bookshelf'], [])

                    if book not in labels:
                        labels.append(book)

                response = results

        elif what.startswith("su"):
            if name == "all":
                response = self.wrapper.all_subjects()
            else:
                response = self.wrapper.search_by_subject(name)

                if add:
                    books_to_add = [self.wrapper.get_book(book["book_id"]) for book in response]
                    self._corpus.add_books(books_to_add)

                # Group "title [author]" labels by subject, without duplicates
                results = {}
                for subject in response:
                    book = f"{subject['title']} [{subject['author']}]"
                    labels = results.setdefault(subject['subject'], [])

                    if book not in labels:
                        labels.append(book)

                response = results
        else:
            warnings.warn("Not a valid option")
            print("Allowed arguments are:\nbook\nauthor\nshelf\nsubject")
            # No query was run, so there is nothing to report
            return None

        if not response:
            warnings.warn(f"No {what} found.\n"
                          "Please make sure there are no spelling mistakes.\n"
                          "You may search all available options by ignoring the [name] argument")
        return response

    def corpus(self, name=None, description=None):
        """
        Return the corpus, optionally renaming it or changing its description first.
        Args:
            name (str, optional): new name of the corpus. Default to None
            description (str, optional): new description of the corpus. Default to None

        Returns:
            GutenbergCorpus: the corpus of this data source
        """
        if name is not None:
            self._corpus.name(name)
        if description is not None:
            self._corpus.description(description)

        return self._corpus

    def save(self, close=True):
        """
        Write the corpus texts, its metadata tables and a pickled dataframe to the corpus path.
        Args:
            close (bool, optional): call `self.close()` once everything is saved. Default to True
        """

        # Prepare save directory
        path = pathlib.Path(self._corpus.path())

        if not path.is_dir():
            make_dirs(path)

        # Save Gutenberg Texts
        self._corpus.download_corpus()

        # Save metadata table
        self._corpus.download_metadata()

        # Save Python Query Object (the with block closes the file)
        with open(path / "corpus.pk", 'wb') as pickled_file:
            pickle.dump(self._corpus.to_pandas_dataframe(),
                        pickled_file, protocol=pickle.HIGHEST_PROTOCOL)

        if close:
            # close() is not defined in this class; presumably inherited
            self.close()

get(self, what, name='all', add=False)

Extension wrapper method to call all DHTK functionalities with a simple syntax

Parameters

- what (string): Type of information to retrieve. DHTK Gutenberg has the options to search for books, authors, shelves and subjects.
- name (string, default: "all"): Name identifying the specific information to retrieve. If "all", retrieve all information available.
- add (boolean, default: False): Add query results to the Corpus.

Returns

Requested book information from Gutenberg dataset

Source code in dhtk/data_sources/gutenberg/__init__.py
def get(self, what, name="all", add=False):
    """
    Extension wrapper method to call all DHTK functionalities with a simple syntax

    Parameters
    ----------
    what: string
        Type of information to retrieve.
        DHTK Gutenberg has the options to search for books, authors, shelves and subjects.
        Only the first two letters are significant ("bo", "au", "sh", "su").
    name: string [default: "all"]
        Name identifying the specific information to retrieve.
        If all, retrieve all information available
    add: boolean [default: False]
        Add query results to Corpus

    Returns
    -------
    Requested book information from Gutenberg dataset,
    or None when `what` is not a valid option
    """

    # Prepare arguments
    name = name.strip().lower()
    what = what.strip().lower()

    if what.startswith("bo"):
        if name == "all":
            response = self.wrapper.all_books()
        else:
            response = self.wrapper.search_by_title(name)
            response = [self.wrapper.get_book(book["book_id"]) for book in response]

            if add:
                self._corpus.add_books(response)

            # Use a short book title
            response = {f"{book.get_title()[:20]} ({book.get_book_id_number()})": book
                        for book in response}

    elif what.startswith("au"):
        if name == "all":
            response = self.wrapper.all_authors()
        else:
            response = self.wrapper.search_by_author(name)
            ids = set(author["author_id"] for author in response)
            response = [self.wrapper.get_author(author) for author in ids]

            if add:
                for author in response:
                    for title in author.get_bibliography():
                        # Search by title, then keep this author's books only
                        found = self.get(what="book", name=title)
                        by_author = {key: book for key, book in found.items()
                                     if book.get_author() == author}

                        # Match short book key name
                        sub_title = re.sub(r"\s+", " ", title[:20])
                        self._corpus.add_books(
                            [book for key, book in by_author.items() if sub_title in key]
                        )

            response = {f"{author.get_full_name()}": author for author in response}

    elif what.startswith("sh"):
        if name == "all":
            response = self.wrapper.all_bookshelves()
        else:
            response = self.wrapper.search_by_bookshelf(name)
            if add:
                books_to_add = [self.wrapper.get_book(book["book_id"]) for book in response]
                self._corpus.add_books(books_to_add)

            # Group "title [author]" labels by bookshelf, without duplicates
            results = {}
            for shelf in response:
                book = f"{shelf['title']} [{shelf['author']}]"
                labels = results.setdefault(shelf['bookshelf'], [])

                if book not in labels:
                    labels.append(book)

            response = results

    elif what.startswith("su"):
        if name == "all":
            response = self.wrapper.all_subjects()
        else:
            response = self.wrapper.search_by_subject(name)

            if add:
                books_to_add = [self.wrapper.get_book(book["book_id"]) for book in response]
                self._corpus.add_books(books_to_add)

            # Group "title [author]" labels by subject, without duplicates
            results = {}
            for subject in response:
                book = f"{subject['title']} [{subject['author']}]"
                labels = results.setdefault(subject['subject'], [])

                if book not in labels:
                    labels.append(book)

            response = results
    else:
        warnings.warn("Not a valid option")
        print("Allowed arguments are:\nbook\nauthor\nshelf\nsubject")
        # No query was run, so there is nothing to report
        return None

    if not response:
        warnings.warn(f"No {what} found.\n"
                      "Please make sure there are no spelling mistakes.\n"
                      "You may search all available options by ignoring the [name] argument")
    return response

get_data_file(output_path, storage_type) classmethod

Download the content defined in data_file and write it to a file in output_path.

Parameters:

Name Type Description Default
output_path Path

the path where to write (i.e. output_path = Path('WD/gutenberg/data/triplestore'))

required
storage_type str

the type of the storage

required

Returns:

Type Description
Path

the path to the file with data

Source code in dhtk/data_sources/gutenberg/__init__.py
@classmethod
def get_data_file(cls, output_path, storage_type):
    """
    Get a content as defined in self.data_file and write into a file into output_path
    Args:
        output_path (Path): the path where to write (i.e. output_path = Path('WD/gutenberg/data/triplestore'))
        storage_type (str): the type of the storage

    Returns:
        Path: the path to the file with data

    """
    #output_path can be like this output_path = 'WD/gutenberg/data/triplestore'
    #storage_type can be like this storage_type= 'triplestore'
    if isinstance(cls.storage_type, str):
        data_file = cls.data_file
    else:
        # Several storage types: data_file is expected to list one URL per
        # storage type, in the same order (assumption - TODO confirm).
        data_file = cls.data_file[cls.storage_type.index(storage_type)]
    download_files(data_file, output_path, "gutenberg.ttl")
    # Accept plain strings as well as Path objects
    return pathlib.Path(output_path) / "gutenberg.ttl"

api special

corpus

GutenbergCorpus (Corpus)

Class to create a corpus from books of type dhtk.common.Book.

Notes: The corpus can be created from a list of books. The list of books can be obtained by applying different filters, such as searching for books by:

    - subject :class:`Data.search_by_subject()`,
    - author :class:`Data.search_by_author()`,
    - title :class:`Data.search_by_title()`,
    - bookshelves :class:`Data.search_by_bookshelves()`.

A corpus has a name, a description, a path where it is saved and a list of selected books.
A single book or a list of books can be added to an existing corpus.
It is also possible to remove one book or all the books from the corpus.
This class can also download a single book, or all the books of the corpus,
to the local machine.

Examples:

    >>> import os
    >>> from pprint import pprint
    >>> from dhtk.data_sources.templates.corpus import Corpus
    >>> from dhtk.data_sources.gutenberg.api.data import GutenbergData
    >>> # Initialise class GutenbergData as gutenberg_search.
    >>> gutenberg_search = GutenbergData()
    >>> # Search the books by author and store the ids of the books in a list.
    >>> books_found = gutenberg_search.search_by_author("Jane","Austen")
    >>> book_ids = [item['book_id'] for item in books_found]
    >>> books = set()
    >>> # Get only the 4 first books found.
    >>> for book_id in book_ids[0:4]:
    ...     books.add(gutenberg_search.book_from_book_id(book_id))
    >>> # Create the corpus.
    >>> corpus = Corpus(
    ...     "jane_austen",
    ...     description="Books by Jane Austen",
    ...     corpora_path=os.path.expanduser("~/Desktop/"),
    ...     book_list=books
    ... )
    >>> corpus.print_book_list()
    # 0 Jane Austen Emma
    # 1 Jane Austen Gevoel en verstand
    # 2 Jane Austen Emma
    # 3 Jane Austen Lady Susan

Source code in dhtk/data_sources/gutenberg/api/corpus.py
class GutenbergCorpus(Corpus):
    """Class to create a corpus from books of type `dhtk.common.Book`.

    Notes:
        The corpus can be created from a list of books. The list of books can be obtained by applying different filters, such as searching for books by:

            - subject :class:`Data.search_by_subject()`,
            - author :class:`Data.search_by_author()`,
            - title :class:`Data.search_by_title()`,
            - bookshelves :class:`Data.search_by_bookshelves()`.

        A corpus has a name, a description, a path where it is saved and a list of selected books.
        A single book or a list of books can be added to an existing corpus.
        It is also possible to remove one book or all the books from the corpus.
        This class can also download a single book, or all the books of the corpus,
        to the local machine.

    Example:
        NOTE(review): the example uses names (``Corpus(..., corpora_path=...)``,
        ``print_book_list``) that are not defined in the reviewed part of this
        class - confirm it still matches the current API.

        >>> import os
        >>> from pprint import pprint
        >>> from dhtk.data_sources.templates.corpus import Corpus
        >>> from dhtk.data_sources.gutenberg.api.data import GutenbergData
        >>> # Initialise class GutenbergData as gutenberg_search.
        >>> gutenberg_search = GutenbergData()
        >>> # Search the books by author and store the ids of the books in a list.
        >>> books_found = gutenberg_search.search_by_author("Jane","Austen")
        >>> book_ids = [item['book_id'] for item in books_found]
        >>> books = set()
        >>> # Get only the 4 first books found.
        >>> for book_id in book_ids[0:4]:
        ...     books.add(gutenberg_search.book_from_book_id(book_id))
        >>> # Create the corpus.
        >>> corpus = Corpus(
        ...     "jane_austen",
        ...     description="Books by Jane Austen",
        ...     corpora_path=os.path.expanduser("~/Desktop/"),
        ...     book_list=books
        ... )
        >>> corpus.print_book_list()
        # 0 Jane Austen Emma
        # 1 Jane Austen Gevoel en verstand
        # 2 Jane Austen Emma
        # 3 Jane Austen Lady Susan
    """

    # TODO: convertors: metadata -> sqllite | texts -" directory

    def __init__(self, name, working_directory, description="", corpora_path=None, book_list=None):
        """
        Initiate a corpus class
        Args:
            name (str): Name of the corpus.
            working_directory (str): The working directory path. Used as the base of the corpus path when no corpora_path is given.
            description (str,optional): A description of the corpus. Default to "".
            corpora_path (str,optional):  Path where the texts of the books in the corpus are saved. Default to None, i.e. the "requests" directory inside working_directory.
            book_list (list[dhtk.common.book.Book], optional):  A list of common.book.Book and/or child tools of it. Falsy entries are dropped. Default to None.
        """

        self._name = name
        self._description = description

        # Start from the given books (ignoring empty entries) or from an empty list.
        self._book_list = [book for book in book_list if book] if book_list else []

        # Store the base directory in the private attribute only: assigning to
        # `self.path` would hide the `path()` method of this instance and leave
        # `_corpora_path` unset.
        if corpora_path:
            self._corpora_path = pathlib.Path(corpora_path)
        else:
            self._corpora_path = pathlib.Path(working_directory) / "requests"

    # Settings
    def path(self, path=None):
        """Sets and returns the path containing the text files of the books in the corpus.

        Args:
          path(str, optional): the base directory of the corpus; a string or a path object. Default to None, which keeps the current one.

        Returns:
          pathlib.Path: the base directory joined with the corpus name, in which
          whitespace is replaced by underscores.

        Examples:
            >>> corpus.path()
            # '~Desktop/jane_austen'
        """

        if path is not None:
            self._corpora_path = path
        # The base directory may have been given as a plain string
        return pathlib.Path(self._corpora_path) / "_".join(self._name.split())

    def name(self, name=None):
        """Sets and returns the name of the corpus.

        Args:
          name(str, optional): The new name of the corpus; the current name is kept when omitted. (Default value = None)

        Returns:
          str: the name of corpus.

        Examples:
            >>> corpus.name()
            # 'jane_austen'
        """

        # Pure getter when no new name is supplied
        if name is None:
            return self._name

        self._name = name
        return self._name

    def description(self, description=None):
        """Sets and returns the description of the corpus.

        Args:
          description(str, optional): The new description; the current one is kept when omitted. (Default value = None)

        Returns:
          str: The description of the corpus

        Examples:
            >>> corpus.description()
            # 'Books by Jane Austen'
        """

        replace_current = description is not None
        if replace_current:
            self._description = description

        return self._description

    # Books
    def books(self, get=False, remove=False):
        """
        Print, return or remove books of the corpus.

        Without `get`, one line per book is printed: its index in the corpus,
        the full name of its author and its title.

        Args:
          get: Selects books by full title, by short title
            ("<first 20 characters of the title> (<book id number>)") or all of them with "all". (Default value = False)
          remove: Remove the selected books from the corpus instead of returning them. (Default value = False)

        Returns:
          The selected books as a dict keyed by short title, or the book itself
          when exactly one is selected. None when books are printed or removed.

        Examples:
            >>> pprint(corpus.books(get="all"))
            # {'Emma (158)': <GutenbergBook: Jane Austen - Emma gutenberg_id: 158>,
            #  ...}

        """
        # Listing mode
        if not get:
            for position, entry in enumerate(self._book_list):
                writer = entry.get_author()
                writer_name = writer.get_full_name()
                title = entry.get_title()
                print("{} {} {}".format(position, writer_name, title))
            return None

        # Selection mode
        selected = {}
        for entry in self._book_list:
            if not entry:
                continue
            if (get == entry.get_title()  # Full title
                    or get == f"{entry.get_title()[:20]} ({entry.get_book_id_number()})"  # Short title
                    or get == "all"):  # Accept all
                selected[f"{entry.get_title()[:20]} ({entry.get_book_id_number()})"] = entry

        if remove:
            for entry in selected.values():
                self.remove_book(entry)
            return None

        if len(selected) == 1:
            return next(iter(selected.values()))
        return selected

    def add_book(self, book):
        """Add a single book to the corpus.

        Falsy values and books that are already in the corpus are ignored.

        Args:
          book(dhtk.common.book.Book): The book instance.

        Examples:
            Get a single book by gutenberg id.
                >>> len(corpus)
                # 4
                >>> book = gutenberg_search.book_from_book_id(book_ids[5])
                >>> corpus.add_book(book)
                >>> len(corpus)
                # 5
        """

        # Nothing to add: empty value or duplicate
        if not book or book in self._book_list:
            return

        self._book_list.append(book)

    def add_books(self, book_list):
        """Add several books to the corpus, one `add_book` call per entry.

        Args:
          book_list(list[dhtk.common.book.Book]): The book instances to add

        Examples:
            >>> len(corpus)
            # 5
            >>> books = [gutenberg_search.book_from_book_id(book_id) for book_id in book_ids[6:10]]
            >>> corpus.add_books(books)
            >>> len(corpus)
            # 9
        """

        for candidate in book_list:
            self.add_book(candidate)

    @staticmethod
    def get_book_file_name(book):
        """Return the name of the text file of a book.

        Args:
          book (dhtk.common.book.Book): The book whose file name is requested.

        Returns:
          The value of ``book.get_text_file_name()``.

        """
        file_name = book.get_text_file_name()
        return file_name

    def remove_book(self, book):
        """Remove a book from the corpus and delete its text file, if any, from the corpus path.

        Args:
          book (dhtk.common.book.Book): The book to be removed

        Raises:
          ValueError: if the book is not in the corpus.

        """
        self._book_list.remove(book)

        stored_text = self.path() / self.get_book_file_name(book)
        if not stored_text.is_file():
            return
        stored_text.unlink()

    def clear(self):
        """Delete the corpus directory with the files it contains and empty the list of books."""
        corpus_dir = pathlib.Path(self.path())
        directory_present = corpus_dir.is_dir()

        if directory_present:
            # Entries are unlinked one by one before the directory is removed
            for entry in corpus_dir.iterdir():
                entry.unlink()
            corpus_dir.rmdir()

        self._book_list.clear()

    def download_book(self, book):
        """Download the text file for a single book and save it into the corpus directory.

        The corpus directory is created when missing. A book whose text file
        is already there is not downloaded again.

        Args:
          book (dhtk.common.book.Book): The book whose text you need to download

        """

        target_dir = self.path()
        if not target_dir.is_dir():
            make_dirs(target_dir)

        if (target_dir / book.get_text_file_name()).is_file():
            return

        book.repository().save_clean_text_file_to(target_dir)
        # Pause after each download (presumably to spare the remote server)
        sleep(5)

    def download_corpus(self):
        """Download the text file of every book in the corpus.

        Each book of the corpus is handed, in order, to `download_book()`.

        Examples:
            >>> corpus.download_corpus()
            >>> pprint(os.listdir(corpus.path()))
            # ['158-gutenberg.txt',
            #  '19839-gutenberg.txt',
            # '121-gutenberg.txt',
            # '22954-gutenberg.txt',
            # '1212-gutenberg.txt',
            # '25946-gutenberg.txt',
            # '22962-gutenberg.txt',
            # '22953-gutenberg.txt']

        """
        for corpus_book in self._book_list:
            self.download_book(corpus_book)

    def download_metadata(self, filename="books_metadata.csv"):
        """Write the metadata of the corpus to CSV files in the corpus directory.

        Two files are written: `filename`, built from `to_pandas_dataframe()`,
        and "authors_metadata.csv" with one row per distinct author.

        Args:
            filename (str): the filename for the books CSV file. Default to "books_metadata.csv"

        """
        # Imported locally, as `to_pandas_dataframe()` does, so the method
        # does not rely on a module-level pandas import.
        import pandas as pd

        corpus_path = self.path()
        if not corpus_path.is_dir():
            make_dirs(corpus_path)

        # Save book metadata
        pandas_table = self.to_pandas_dataframe()
        pandas_table.to_csv(path_or_buf=corpus_path / filename, index=False)

        # Save author metadata: one entry per distinct author
        authors = {book.get_author() for book in self._book_list}
        authors_meta = {author.get_full_name(): author.to_dict() for author in authors}

        panda_author = pd.DataFrame.from_dict(authors_meta, orient="index")
        # Move the "name" column to the front. An empty corpus produces a
        # table without that column, so only reorder when it is present.
        if "name" in panda_author.columns:
            names = panda_author.pop("name")
            panda_author.insert(0, "name", names)
        panda_author.to_csv(path_or_buf=corpus_path / "authors_metadata.csv", index=False)

    def to_dict(self):
        """Return the corpus as a dict mapping each book's position to its metadata dict.

        The metadata comes from `book.to_dict()`; a "text_file_path" entry is
        added for books whose text file exists in the corpus directory.
        """
        result = {}
        for position, entry in enumerate(self._book_list):
            record = entry.to_dict()
            file_name = entry.get_text_file_name()
            text_path = self.path() / file_name
            if text_path.is_file():
                record["text_file_path"] = text_path
            result[position] = record
        return result

    def to_pandas_dataframe(self):
        """Build a pandas.DataFrame with one row per book of the corpus.

        Rows come from `book.to_dict()`; a "text_file_path" value is added for
        books whose text file exists in the corpus directory.
        """
        import pandas as pd

        rows = []
        for entry in self._book_list:
            row = entry.to_dict()
            file_name = entry.get_text_file_name()
            text_path = self.path() / file_name
            if text_path.is_file():
                row["text_file_path"] = text_path
            rows.append(row)
        return pd.DataFrame.from_dict(rows)

    def __iter__(self):
        """
        Make the corpus iterable.

        Returns:
            iterator (iter) : A generator yielding the books of the corpus in list order.
        """
        yield from self._book_list

    def __len__(self):
        """
        Size of the corpus.

        Returns:
            len (int): The number of books in the corpus.
        """
        stored_books = self._book_list
        return len(stored_books)

    def __repr__(self):
        """
        Describe the corpus and list its books.

        Returns:
             str : The corpus name, description and path, followed by one line per book with its index, its author's full name and its title.

        """
        author_names = [book.get_author().get_full_name() for book in self._book_list]
        # Pad the authors' names to a common width to get aligned columns.
        # `default=0` keeps an empty corpus printable: max() of an empty
        # sequence raises ValueError.
        max_author_name_len = max(map(len, author_names), default=0) + 4
        format_string = "{}\t{:" + str(max_author_name_len) + "}\t{}"

        text = [f"Corpus name:\n\t{self.name()}",
                f"Corpus description:\n\t{self.description()}",
                f"Corpus path:\n\t{self.path()}",
                "Corpus books:"]
        text += [
            format_string.format(i, author_name, book.get_title())
            for i, (author_name, book) in enumerate(zip(author_names, self._book_list))
        ]

        return "\n".join(text)

    def __getitem__(self, item):
        """
        Give access to the books of the corpus by position.

        Args:
            item (int): The index of the book in the corpus

        Returns:
            The book stored at that position of the book list.

        """
        stored_books = self._book_list
        return stored_books[item]
__getitem__(self, item) special

Return the item requested.

Parameters:

Name Type Description Default
item int

The index of the book in the corpus

required

Returns:

Type Description
str

The book requested

Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __getitem__(self, item):
    """
    Give access to the books of the corpus by position.

    Args:
        item (int): The index of the book in the corpus

    Returns:
        The book stored at that position of the book list.

    """
    stored_books = self._book_list
    return stored_books[item]
__init__(self, name, working_directory, description='', corpora_path=None, book_list=None) special

Initiate a corpus class

Parameters:

Name Type Description Default
name str

Name of the corpus.

required
working_directory str

The working directory path

required
description str,optional

A description of the corpus. Default to "".

''
corpora_path str,optional

Path where the texts of the books in the corpus are saved. Default to None:

None
book_list list[dhtk.common.book.Book]

A list of common.book.Book and/or child tools of it.Default to None:

None
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __init__(self, name, working_directory, description="", corpora_path=None, book_list=None):
    """
    Initiate a corpus class

    Args:
        name (str): Name of the corpus.
        working_directory (str): The working directory path; used to build the default corpora path.
        description (str,optional): A description of the corpus. Default to "".
        corpora_path (str,optional):  Path where the texts of the books in the corpus are saved. Default to None, in which case "<working_directory>/requests" is used.
        book_list (list[dhtk.common.book.Book], optional):  A list of common.book.Book and/or child tools of it. Falsy entries are dropped. Default to None.
    """

    self._name = name
    self._description = description

    # Keep the given books, dropping falsy entries such as None.
    self._book_list = [book for book in book_list if book] if book_list else []

    # Store the corpora path as a pathlib.Path, since `path()` joins it with
    # the corpus name using "/". The fallback must go to `_corpora_path`:
    # assigning it to `self.path` would shadow the `path()` method.
    if corpora_path:
        self._corpora_path = pathlib.Path(corpora_path)
    else:
        self._corpora_path = pathlib.Path(working_directory) / "requests"
__iter__(self) special

Add capability to iterate over books in corpus.

Returns:

Type Description
iterator (iter)

An iterator over the books in the corpus's book list.

Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __iter__(self):
    """
    Make the corpus iterable.

    Returns:
        iterator (iter) : A generator yielding the books of the corpus in list order.
    """
    yield from self._book_list
__len__(self) special

List length.

Returns:

Type Description
len (int)

The number of books in the corpus.

Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __len__(self):
    """
    Size of the corpus.

    Returns:
        len (int): The number of books in the corpus.
    """
    stored_books = self._book_list
    return len(stored_books)
__repr__(self) special

Convert book_list in string format.

Returns:

Type Description
str

A string listing the books of the corpus, with each book's index in the list, its author's name and its title.

Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __repr__(self):
    """
    Describe the corpus and list its books.

    Returns:
         str : The corpus name, description and path, followed by one line per book with its index, its author's full name and its title.

    """
    author_names = [book.get_author().get_full_name() for book in self._book_list]
    # Pad the authors' names to a common width to get aligned columns.
    # `default=0` keeps an empty corpus printable: max() of an empty
    # sequence raises ValueError.
    max_author_name_len = max(map(len, author_names), default=0) + 4
    format_string = "{}\t{:" + str(max_author_name_len) + "}\t{}"

    text = [f"Corpus name:\n\t{self.name()}",
            f"Corpus description:\n\t{self.description()}",
            f"Corpus path:\n\t{self.path()}",
            "Corpus books:"]
    text += [
        format_string.format(i, author_name, book.get_title())
        for i, (author_name, book) in enumerate(zip(author_names, self._book_list))
    ]

    return "\n".join(text)
add_book(self, book)

Add a single book to the corpus.

Parameters:

Name Type Description Default
book(dhtk.common.book.Book)

The book instance.

required

Examples:

Get a single book by gutenberg id. >>> len(corpus) # 4 >>> book = gutenberg_search.book_from_book_id(book_ids[5]) >>> corpus.add_book(book) >>> len(corpus) # 5

Source code in dhtk/data_sources/gutenberg/api/corpus.py
def add_book(self, book):
    """Add a single book to the corpus.

    Falsy values and books already in the corpus are silently ignored.

    Args:
      book(dhtk.common.book.Book): The book instance.

    Examples:
        Get a single book by gutenberg id.
            >>> len(corpus)
            # 4
            >>> book = gutenberg_search.book_from_book_id(book_ids[5])
            >>> corpus.add_book(book)
            >>> len(corpus)
            # 5
    """
    if not book:
        # Nothing usable was given.
        return
    if book in self._book_list:
        # Already part of the corpus: keep a single entry.
        return
    self._book_list.append(book)
add_books(self, book_list)

Add a list of books to the corpus.

Parameters:

Name Type Description Default
book_list(list

list: list[dhtk.common.book.Book]): A list of books instances

required

Examples:

>>> len(corpus)
# 5
>>> books = [gutenberg_search.book_from_book_id(book_id) for book_id in book_ids[6:10]]
>>> corpus.add_books(books)
>>> len(corpus)
# 9
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def add_books(self, book_list):
    """Add several books to the corpus.

    Every element is handed, in order, to `add_book()`.

    Args:
      book_list(list[dhtk.common.book.Book]): A list of books instances

    Examples:
        >>> len(corpus)
        # 5
        >>> books = [gutenberg_search.book_from_book_id(book_id) for book_id in book_ids[6:10]]
        >>> corpus.add_books(books)
        >>> len(corpus)
        # 9
    """
    for new_book in book_list:
        self.add_book(new_book)
books(self, get=False, remove=False)

Returns the list of books in the corpus. Print list of books in the corpus. This list contains the number of books in corpus, authors' full name and the books title.

Parameters:

Name Type Description Default
get

Default value = False)

False
remove

Default value = False)

False
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def books(self, get=False, remove=False):
    """
    Print, return or remove books of the corpus.

    Without `get`, one line per book is printed: its index, its author's full name and its title.
    With `get`, books are selected by full title, by short label
    ("<first 20 characters of the title> (<book id number>)"), or all at once with "all".

    Args:
      get: Selector for the books, see above. (Default value = False)
      remove: If true, the selected books are removed with `remove_book()` instead of being returned. (Default value = False)

    Returns:
      With `get` and without `remove`: the matching book when exactly one matches,
      otherwise a dict mapping short labels to the matching books. None in all other cases.

    Examples:
        >>> corpus.books(get="all")
        # {'Emma (158)': <GutenbergBook: Jane Austen - Emma gutenberg_id: 158>,
        #  'Lady Susan (22953)': <GutenbergBook: Jane Austen - Lady Susan gutenberg_id: 22953>}

    """
    if not get:
        # Listing mode: print an overview of the corpus.
        for index, book in enumerate(self._book_list):
            author_full_name = book.get_author().get_full_name()
            print("{} {} {}".format(index, author_full_name, book.get_title()))
        return None

    selected = {}
    for book in self._book_list:
        if not book:
            continue
        title = book.get_title()
        label = f"{title[:20]} ({book.get_book_id_number()})"
        if get == title or get == label or get == "all":
            selected[label] = book

    if remove:
        for book in selected.values():
            self.remove_book(book)
        return None

    if len(selected) == 1:
        return list(selected.values())[0]
    return selected
clear(self)

Delete all files and books in the corpus.

Source code in dhtk/data_sources/gutenberg/api/corpus.py
def clear(self):
    """Delete all files and books in the corpus.

    The corpus directory is removed together with everything it contains,
    then the in-memory list of books is emptied.
    """
    import shutil

    folder = pathlib.Path(self.path())

    if folder.is_dir():
        # rmtree also copes with sub-directories, on which Path.unlink()
        # fails and would abort the clean-up half-way.
        shutil.rmtree(folder)

    self._book_list.clear()
description(self, description=None)

Sets and returns the description of the corpus.

Parameters:

Name Type Description Default
description(str, optional

The description of the corpus (Default value = None)

required

Returns:

Type Description
str

The description of the corpus

Examples:

>>> corpus.description()
# 'Books by Jane Austen'
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def description(self, description=None):
    """Get the description of the corpus, optionally replacing it first.

    Args:
      description(str, optional): The new description of the corpus; None leaves it unchanged. (Default value = None)

    Returns:
      str: The description of the corpus

    Examples:
        >>> corpus.description()
        # 'Books by Jane Austen'
    """
    if description is None:
        return self._description

    self._description = description
    return self._description
download_book(self, book)

Download the text file for a single book and save it into the corpus repository

Parameters:

Name Type Description Default
book dhtk.common.book.Book

The book whose text you need to download

required
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def download_book(self, book):
    """Download the text file for a single book and save it into the corpus directory.

    The corpus directory is created when missing. Nothing is downloaded
    when the text file of the book is already there.

    Args:
      book (dhtk.common.book.Book): The book whose text you need to download

    """
    target_dir = self.path()
    if not target_dir.is_dir():
        make_dirs(target_dir)

    text_file = target_dir / book.get_text_file_name()
    if text_file.is_file():
        # Already downloaded.
        return

    book.repository().save_clean_text_file_to(target_dir)
    # NOTE(review): fixed pause after each download, presumably to go easy on the remote server - confirm.
    sleep(5)
download_corpus(self)

Download the text files for the full corpus to the corpus path directory.

Examples:

>>> corpus.download_corpus()
>>> pprint(os.listdir(corpus.get_corpus_path()))
# ['158-gutenberg.txt',
#  '19839-gutenberg.txt',
# '121-gutenberg.txt',
# '22954-gutenberg.txt',
# '1212-gutenberg.txt',
# '25946-gutenberg.txt',
# '22962-gutenberg.txt',
# '22953-gutenberg.txt']
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def download_corpus(self):
    """Download the text file of every book in the corpus.

    Each book of the corpus is handed, in order, to `download_book()`.

    Examples:
        >>> corpus.download_corpus()
        >>> pprint(os.listdir(corpus.path()))
        # ['158-gutenberg.txt',
        #  '19839-gutenberg.txt',
        # '121-gutenberg.txt',
        # '22954-gutenberg.txt',
        # '1212-gutenberg.txt',
        # '25946-gutenberg.txt',
        # '22962-gutenberg.txt',
        # '22953-gutenberg.txt']

    """
    for corpus_book in self._book_list:
        self.download_book(corpus_book)
download_metadata(self, filename='books_metadata.csv')

Download the metadata for the full corpus into a csv file

Parameters:

Name Type Description Default
filename str

the filename for the CSV file. Default to "books_metadata.csv"

'books_metadata.csv'
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def download_metadata(self, filename="books_metadata.csv"):
    """Write the metadata of the corpus to CSV files in the corpus directory.

    Two files are written: `filename`, built from `to_pandas_dataframe()`,
    and "authors_metadata.csv" with one row per distinct author.

    Args:
        filename (str): the filename for the books CSV file. Default to "books_metadata.csv"

    """
    # Imported locally, as `to_pandas_dataframe()` does, so the method
    # does not rely on a module-level pandas import.
    import pandas as pd

    corpus_path = self.path()
    if not corpus_path.is_dir():
        make_dirs(corpus_path)

    # Save book metadata
    pandas_table = self.to_pandas_dataframe()
    pandas_table.to_csv(path_or_buf=corpus_path / filename, index=False)

    # Save author metadata: one entry per distinct author
    authors = {book.get_author() for book in self._book_list}
    authors_meta = {author.get_full_name(): author.to_dict() for author in authors}

    panda_author = pd.DataFrame.from_dict(authors_meta, orient="index")
    # Move the "name" column to the front. An empty corpus produces a
    # table without that column, so only reorder when it is present.
    if "name" in panda_author.columns:
        names = panda_author.pop("name")
        panda_author.insert(0, "name", names)
    panda_author.to_csv(path_or_buf=corpus_path / "authors_metadata.csv", index=False)
get_book_file_name(book) staticmethod

Return a good filename for a book.

Parameters:

Name Type Description Default
book dhtk.common.book.Book

It is the book from searching in gutenberg and get book id using the method book_from_book_id().

required
Source code in dhtk/data_sources/gutenberg/api/corpus.py
@staticmethod
def get_book_file_name(book):
    """Return a good filename for a book.

    Args:
      book (dhtk.common.book.Book): It is the book from searching in gutenberg and get book id using the method `book_from_book_id()`.

    Returns:

    """
    return book.get_text_file_name()
name(self, name=None)

Parameters:

Name Type Description Default
name(str, optional

The corpus file will be saved in the local machine with this name. (Default value = None)

required

Returns:

Type Description
str

the name of corpus.

Examples:

>>> corpus.get_name()
# 'jane_austen'
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def name(self, name=None):
    """Get the name of the corpus, optionally replacing it first.

    Args:
      name(str, optional): The new name of the corpus; None leaves it unchanged. (Default value = None)

    Returns:
      str: the name of corpus.

    Examples:
        >>> corpus.name()
        # 'jane_austen'
    """
    if name is None:
        return self._name

    self._name = name
    return self._name
path(self, path=None)

Sets and returns the path containing the text files of the books in the corpus.

Parameters:

Name Type Description Default
path(str, optional

the path dir. Default to None

required

Returns:

Type Description
str

Path of corpus with the name of directory and the name of corpus. Examples:

corpus.path() # '~Desktop/jane_austen'

Source code in dhtk/data_sources/gutenberg/api/corpus.py
def path(self, path=None):
    """Sets and returns the path containing the text files of the books in the corpus.

    Args:
      path(str or pathlib.Path, optional): New directory holding the corpora. Default to None, which keeps the current one.

    Returns:
      pathlib.Path: The corpora directory joined with the corpus name, whose
      whitespace-separated words are joined with "_".

    Examples:
        >>> corpus.path()
        # '~Desktop/jane_austen'
    """

    if path is not None:
        self._corpora_path = path
    # The stored directory may be a plain string (the documented type);
    # the "/" join below needs a pathlib.Path.
    directory = pathlib.Path(self._corpora_path)
    return directory / "_".join(self._name.split())
remove_book(self, book)

Delete a book from the corpus by removing it from the list of books and also deleting the book's file on the local machine.

Parameters:

Name Type Description Default
book dhtk.common.book.Book

The book to be removed

required
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def remove_book(self, book):
    """Remove a book from the corpus and delete its text file, if there is one.

    Args:
      book (dhtk.common.book.Book): The book to be removed

    Raises:
      ValueError: If the book is not in the list of books (raised by `list.remove`).

    """
    self._book_list.remove(book)
    book_file = self.path() / self.get_book_file_name(book)
    if not book_file.is_file():
        # No text file is stored for this book: nothing to delete.
        return
    book_file.unlink()
to_dict(self)

Convert to python dict for general purpose.

Source code in dhtk/data_sources/gutenberg/api/corpus.py
def to_dict(self):
    """Return the corpus as a dict mapping each book's position to its metadata dict.

    The metadata comes from `book.to_dict()`; a "text_file_path" entry is
    added for books whose text file exists in the corpus directory.
    """
    result = {}
    for position, entry in enumerate(self._book_list):
        record = entry.to_dict()
        file_name = entry.get_text_file_name()
        text_path = self.path() / file_name
        if text_path.is_file():
            record["text_file_path"] = text_path
        result[position] = record
    return result
to_pandas_dataframe(self)

Convert the list of books into a pandas.DataFrame.

Source code in dhtk/data_sources/gutenberg/api/corpus.py
def to_pandas_dataframe(self):
    """Build a pandas.DataFrame with one row per book of the corpus.

    Rows come from `book.to_dict()`; a "text_file_path" value is added for
    books whose text file exists in the corpus directory.
    """
    import pandas as pd

    rows = []
    for entry in self._book_list:
        row = entry.to_dict()
        file_name = entry.get_text_file_name()
        text_path = self.path() / file_name
        if text_path.is_file():
            row["text_file_path"] = text_path
        rows.append(row)
    return pd.DataFrame.from_dict(rows)

data

Contains the GutenbergData implementation of the abstract LiteraryData class

GutenbergData (Data)

Class for searching the Gutenberg catalog using SPARQL queries, inheriting from the abstract class LiteraryData (dhtk.data_sources.abstract_gutenberg).

"Query" attributes participate to create a skeleton of a standard query : query_header + query_select + query_head.

Source code in dhtk/data_sources/gutenberg/api/data.py
class GutenbergData(Data):
    """Class to searching the Gutenberg catalog using SPARQL queries,
    inheriting from the Abstract class LiteraryData (dhtk.data_sources.abstract_gutenberg)

    "Query" attributes participate to create a skeleton of a standard query :
        query_header + query_select + query_head.

    Args:

    Returns:


    """
    # TODO: implement different types than text! '?book_id dcterms:type dcmitype:Text.'
    # TODO: add method to search book when author is known.

    _namespace = "\n".join([
        "PREFIX dcterms: <http://purl.org/dc/terms/>",
        "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>",
        "PREFIX purl: <http://purl.org/dc/terms/>",
        "PREFIX owl: <http://www.w3.org/2002/07/owl#>",
        "PREFIX pgterms: <http://www.gutenberg.org/2009/pgterms/>",
        "PREFIX foaf: <http://xmlns.com/foaf/0.1/>",
        "PREFIX marcrel: <http://id.loc.gov/vocabulary/relators/>",
        "PREFIX dcmitype: <http://purl.org/dc/dcmitype/>\n",
    ])

    _work_types = [
        "Text",
        "Image",
        "Sound",
        "Dataset",
        "StillImage",
        "Collection",
        "MovingImage",
    ]

    _search_cache = dict()

    # Triple patterns for the metadata of a book; "%s" stands for the
    # URI of the book. Blank-node values are read through rdf:value.
    _book_metadata = ("""<%s> rdf:type ?gutenberg_type . """,
                      "<%s> pgterms:downloads ?gutenberg_downloads .",
                      "<%s> dcterms:publisher ?gutenberg_publisher .",
                      "<%s> dcterms:hasFormat ?gutenberg_hasFormat .",
                      """<%s> dcterms:language [rdf:value ?gutenberg_language] .""",
                      # Was "rdf:valuowl:sameAse": a stray "owl:sameAs" pasted
                      # in the middle of "rdf:value" made the pattern invalid.
                      """<%s> dcterms:subject  [rdf:value ?gutenberg_subject] .""",
                      """<%s> dcterms:type [rdf:value ?gutenberg_media_type] .""",
                      "<%s> dcterms:rights ?gutenberg_rights .",
                      "<%s> dcterms:title ?gutenberg_title .",
                      "<%s> dcterms:issued ?gutenberg_issued .",
                      "<%s> dcterms:creator ?gutenberg_creator .",
                      "<%s> dcterms:license ?gutenberg_license .",
                      "<%s> dcterms:tableOfContents ?gutenberg_tableOfContents .",
                      "<%s> pgterms:marc010 ?gutenberg_marc010 .",
                      "<%s> pgterms:marc901 ?gutenberg_marc901 .",
                      """<%s> pgterms:bookshelf [rdf:value ?gutenberg_bookshelf] .""",
                      "<%s> pgterms:marc440 ?gutenberg_marc440 .",
                      "<%s> dcterms:description ?gutenberg_description .",
                      "<%s> marcrel:trl ?gutenberg_trl .",
                      "<%s> dcterms:alternative ?gutenberg_alternative .",
                      "<%s> marcrel:edt ?gutenberg_edt .",
                      "<%s> marcrel:aui ?gutenberg_aui .",
                      "<%s> marcrel:pbl ?gutenberg_pbl .",
                      "<%s> marcrel:ill ?gutenberg_ill .",
                      "<%s> marcrel:cmm ?gutenberg_cmm .",
                      "<%s> marcrel:com ?gutenberg_com .",
                      "<%s> marcrel:oth ?gutenberg_oth .",
                      "<%s> pgterms:marc260 ?gutenberg_marc260 .",
                      "<%s> marcrel:ctb ?gutenberg_ctb .",
                      "<%s> marcrel:ann ?gutenberg_ann .",
                      "<%s> marcrel:egr ?gutenberg_egr .",
                      "<%s> pgterms:marc508 ?gutenberg_marc508 .",
                      "<%s> pgterms:marc546 ?gutenberg_marc546 .",
                      "<%s> pgterms:marc902 ?gutenberg_marc902 .",
                      "<%s> pgterms:marc520 ?gutenberg_marc520 .",
                      "<%s> pgterms:marc903 ?gutenberg_marc903 .",
                      "<%s> pgterms:marc300 ?gutenberg_marc300 .",
                      "<%s> marcrel:adp ?gutenberg_adp .",
                      "<%s> marcrel:pht ?gutenberg_pht .",
                      "<%s> marcrel:unk ?gutenberg_unk .",
                      "<%s> marcrel:prt ?gutenberg_prt .",
                      "<%s> marcrel:prf ?gutenberg_prf .",
                      "<%s> pgterms:marc250 ?gutenberg_marc250 .",
                      "<%s> pgterms:marc020 ?gutenberg_marc020 .",
                      "<%s> marcrel:cmp ?gutenberg_cmp .",
                      "<%s> marcrel:dub ?gutenberg_dub .",
                      "<%s> marcrel:arr ?gutenberg_arr .",
                      "<%s> marcrel:trc ?gutenberg_trc .",
                      "<%s> marcrel:clb ?gutenberg_clb .",
                      "<%s> marcrel:aft ?gutenberg_aft .",
                      "<%s> marcrel:res ?gutenberg_res .",
                      "<%s> marcrel:art ?gutenberg_art .",
                      "<%s> owl:sameAs|foaf:isPrimaryTopicOf ?same_as  .",)

    _author_metadata = ("<%s> pgterms:alias ?aliases .",
                        "<%s> pgterms:birthdate ?birth_date .",
                        "<%s> pgterms:deathdate ?death_date .",
                        "<%s> owl:sameAs|foaf:isPrimaryTopicOf ?same_as .",
                        "<%s> pgterms:webpage ?web_pages .",
                        "<%s> rdf:type ?gutenberg_type .")

    def __init__(self, sparql_endpoint):
        """Set up the SPARQL endpoint used by the queries,
        such as a local instance of the Apache Jena Fuseki server.

        Args:
            sparql_endpoint (str) : URL of the triplet store containing Gutenberg Catalog triplets.

        Raises:
            EnvironmentError: If the SPARQLWrapper cannot be created for the given endpoint.
        """
        try:
            endpoint = SPARQLWrapper(sparql_endpoint)
        except Exception as error:
            message = f"Check the sparql_endpoint you provided!: {sparql_endpoint}"
            raise EnvironmentError(message) from error
        self._sparql_endpoint = endpoint

        logger.info(f"GUTENBERG: GutenbergData instantiated using SPARQL endpoint: {sparql_endpoint}")

    # Bookshelves
    _shelves = """?book_id pgterms:bookshelf [dcterms:title ?bookshelf] ."""

    def all_bookshelves(self, select="SELECT DISTINCT ?bookshelf"):
        """Return all bookshelves in the store.

        The query is built from the namespace prefixes, the given SELECT clause
        and a pattern matching only bookshelves.

        Args:
          select (str) : SELECT clause of the SPARQL query (Default value = "SELECT DISTINCT ?bookshelf")

        Returns:
            list : the "bookshelf" value of every result row; when `select`
            contains "COUNT", the "total" value of the first row instead.

        """
        where_clause = f"""
                WHERE {{
                    {self._shelves}
                }}
                ORDER BY ?bookshelf
                """
        results = self._get_query_results(self._namespace + select + where_clause)

        if "COUNT" not in select:
            return [row["bookshelf"] for row in results]
        return results[0]["total"]

    def search_by_bookshelf(self, bookshelf):
        """Search the Gutenberg catalog for all books on bookshelves matching the given string.

        The bookshelf is used as parameter in a SPARQL query.

        Args:
          bookshelf(str): bookshelf in plain text, case-insensitive. It can be a part of the bookshelf.

        Returns:
            The value returned by `_get_query_results()` for the query.

        """
        logger.info(f"GUTENBERG: Searching bookshelf: {bookshelf}")

        # Escape the characters that would end or corrupt the SPARQL string
        # literal the search text is placed in (backslash first).
        needle = bookshelf.lower()
        for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
            needle = needle.replace(raw, escaped)

        select = "SELECT DISTINCT *"
        pattern = f"""
            WHERE {{
                {self._shelves}
                FILTER CONTAINS(lcase(str(?bookshelf)), "{needle}")
                {self._books}   
                OPTIONAL {{ ?book_id dcterms:language [rdf:value ?language].}}             
            }}
            ORDER BY ?author ?title
            """
        query = self._namespace + select + pattern
        return self._get_query_results(query)

    # Subjects
    _subjects = """?book_id dcterms:subject [dcterms:title ?subject]."""

    def all_subjects(self, select="SELECT DISTINCT ?subject"):
        """Return all subjects in the store.

        The query is built from the namespace prefixes, the given SELECT clause
        and a pattern matching only subjects.

        Args:
          select (str): SELECT clause of the SPARQL query (Default value = "SELECT DISTINCT ?subject")

        Returns:
            list : the "subject" value of every result row; when `select`
            contains "COUNT", the "total" value of the first row instead.

        """
        where_clause = f"""
            WHERE {{
                {self._subjects}
            }}
            ORDER BY ?subject
        """
        results = self._get_query_results(self._namespace + select + where_clause)

        if "COUNT" not in select:
            return [row["subject"] for row in results]
        return results[0]["total"]

    def search_by_subject(self, subject, limit=0):
        """Search the Gutenberg catalog for all books with the given subject string.

        The subject is used as parameter in a SPARQL query. If no limit value is specified,
        the method returns all books with the given subject.

        Args:
          subject(str): Subject in plain text, case-insensitive. It can be a part of the subject.
          limit(int, optional): Use to limit how many books are returned by the SPARQL query. (Default value = 0)

        Returns:
            The value returned by `_get_query_results()` for the query.

        """
        # Escape the characters that would end or corrupt the SPARQL string
        # literal the search text is placed in (backslash first).
        needle = subject.lower()
        for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
            needle = needle.replace(raw, escaped)

        select = "SELECT DISTINCT *"
        pattern = f"""
                    WHERE {{
                        {self._subjects}
                        FILTER CONTAINS(lcase(str(?subject)), "{needle}")
                        {self._books}   
                        OPTIONAL {{ ?book_id dcterms:language [rdf:value ?language].}}          
                    }}
                    ORDER BY ?author ?title
                    """
        query = self._namespace + select + pattern
        if limit > 0:
            query += f"LIMIT {limit}"

        return self._get_query_results(query)

    # Authors
    _authors = """?author_id a	pgterms:agent;
                  pgterms:name|pgterms:alias ?author."""

    def all_authors(self, select="SELECT DISTINCT ?author"):
        """Return all authors in the store.

        The query is built from the namespace prefixes, the given SELECT clause
        and a pattern matching only authors.

        Args:
          select (str): SELECT clause of the SPARQL query (Default value = "SELECT DISTINCT ?author")

        Returns:
            list : the "author" value of every result row; when `select`
            contains "COUNT", the "total" value of the first row instead.

        """
        where_clause = f"""
            WHERE {{
                {self._authors}
            }}
            ORDER BY ?author
        """
        results = self._get_query_results(self._namespace + select + where_clause)

        if "COUNT" not in select:
            return [row["author"] for row in results]
        return results[0]["total"]

    def search_by_author(self, name, alias=None):
        """
        Search the authors of the Gutenberg catalog by name and, optionally, alias.

        `name` and `alias` are split into words on non-word characters; every word
        becomes one case-insensitive FILTER of the query.

        Args:
          name(str): Author's name, first name, last name or alias in plain text, case-insensitive. It can be a part of the author's name.
          alias(str, optional): Words that must appear in the author's aliases. Without it, the words of `name` are looked up in the aliases as well. (Default value = None)

        Returns:
            The value returned by `_get_query_results()` for the query.

        """
        clauses = []
        for word in re.split(r'\W+', name):
            needle = word.lower()
            clause = f'FILTER (CONTAINS(lcase(str(?author)), "{needle}")'
            if alias is None:
                # No alias given: the name itself might be an alias.
                clause += f' ||  CONTAINS(lcase(str(?aliases)), "{needle}")'
            clauses.append(clause + ")\n")
        if alias is not None:
            # An alias was given: require the name AND the alias.
            clauses.extend(
                f'FILTER (CONTAINS(lcase(str(?aliases)), "{word.lower()}"))\n'
                for word in re.split(r'\W+', alias)
            )
        author_filter = "".join(clauses)

        select = "SELECT DISTINCT ?author ?aliases ?author_id"
        pattern = f"""   
            WHERE {{
                {self._authors}
                {author_filter}
                OPTIONAL {{ ?author_id pgterms:alias ?aliases. }}
            }}
            ORDER BY ?author
            """

        return self._get_query_results(self._namespace + select + pattern)

    def get_author(self, id):
        """Build an author object from the data held in the Gutenberg store.

        The author's name is read from the first row of a ``pgterms:name``
        query; the object is then enriched with ``get_metadata`` and with a
        "bibliography" entry produced by ``get_bibliography``.

        Args:
          id(str): The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408'

        Returns:
            GutenbergAuthor : the populated author object

        """
        where_clause = f"""
            WHERE {{
                <{id}> pgterms:name ?name . 
            }}
        """
        first_row = self._get_query_results(self._namespace + "SELECT DISTINCT *" + where_clause)[0]

        author = GutenbergAuthor(gutenberg_id=id, name=first_row["name"])
        catalog_metadata = self.get_metadata(author)
        author.update_metadata(catalog_metadata)
        bibliography = self.get_bibliography(id)
        author.update_metadata({"bibliography": bibliography})
        return author

    def get_bibliography(self, id):
        """Collect the titles of all books whose creator is the given author.

        Args:
          id(str): The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408'

        Returns:
            list : the "title" binding of every result row, ordered by title

        """
        where_clause = f"""
            WHERE {{
                ?book_id purl:creator <{id}> .
                {self._books}
            }}
            ORDER BY ?title
        """
        rows = self._get_query_results(self._namespace + "SELECT DISTINCT ?title" + where_clause)

        titles = []
        for row in rows:
            titles.append(row["title"])
        return titles

    # Books
    # Graph pattern reused by the book queries below: it binds ?title to the
    # purl:title of ?book_id and interpolates the `_authors` pattern.
    # NOTE(review): no triple in this pattern links ?book_id to ?author_id
    # (get_book uses purl:creator for that) - confirm this is intentional,
    # since unlinked variables are matched independently of each other.
    _books = f"""?book_id purl:title ?title.
                {_authors}
                """

    def all_books(self, select="SELECT DISTINCT ?title ?author"):
        """List the title and author of every book in the store, or count them.

        Args:
          select (str): SELECT clause placed in front of the WHERE block built
            from ``self._books``. When it contains the text "COUNT", the
            "total" binding of the first result row is returned instead.
            (Default value = "SELECT DISTINCT ?title ?author")

        Returns:
            list : one ``{"title": ..., "author": ...}`` dict per result row,
            or the "total" value of the first row for a COUNT select.

        """
        where_clause = f"""
            WHERE {{
                {self._books}
            }}
            ORDER BY ?title
          """
        rows = self._get_query_results(self._namespace + select + where_clause)

        if "COUNT" in select:
            return rows[0]["total"]
        return [{field: row[field] for field in ("title", "author")} for row in rows]

    def search_by_title(self, title):
        """Search the Gutenberg catalog for all books whose title contains the given string.

        Args:
          title(str): Title in plain text, case-insensitive. It can be a part of the title.

        Returns:
            list : the query results, one dict per result row

        """
        # Build the SPARQL string literal explicitly instead of relying on
        # Python's repr(): escape the characters that would terminate or
        # corrupt a double-quoted literal (backslash first).
        needle = title.lower()
        for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
            needle = needle.replace(raw, escaped)

        select = "SELECT DISTINCT ?book_id ?title ?author_id ?author ?language"
        pattern = f"""
            WHERE {{
              {self._books}
              OPTIONAL {{?book_id dcterms:language [rdf:value ?language].}}
              FILTER CONTAINS(lcase(str(?title)), "{needle}")
            }}
            ORDER BY ?author ?title
            """
        query = self._namespace + select + pattern
        return self._get_query_results(query)

    def get_book(self, book_id, author=None):
        """Build a book object from the data held in the Gutenberg store.

        Title and creator are read from the first row of the book query. The
        subjects and bookshelves come from ``bookshelves_subjects`` and further
        metadata from ``get_metadata``.

        Args:
          book_id(str): The book identifier is a URI, like 'http://www.gutenberg.org/ebooks/20063'
          author: Author object to attach to the book. When falsy, the author is
            fetched with ``get_author`` from the book's creator. (Default value = None)

        Returns:
            GutenbergBook :  A book object

        """
        where_clause = f"""
            WHERE {{
                <{book_id}> purl:title ?title; 
                    purl:creator ?author_id;
                    dcterms:type dcmitype:Text.          
            }}"""
        record = self._get_query_results(self._namespace + "SELECT DISTINCT *" + where_clause)[0]

        if not author:
            author = self.get_author(record["author_id"])

        shelves_and_subjects = self.bookshelves_subjects(book_id)
        # Empty sets are passed on as None.
        subjects = shelves_and_subjects["subjects"] or None
        bookshelves = shelves_and_subjects["bookshelves"] or None

        book = GutenbergBook(
            gutenberg_id=book_id,
            title=record["title"],
            subject=subjects,
            bookshelf=bookshelves,
            author=author
        )
        book.update_metadata(self.get_metadata(book))
        return book

    def bookshelves_subjects(self, book_id):
        """Return the bookshelves and the subjects of the book with the given identifier.

        Args:
          book_id(str): A Gutenberg book identifier. Is a URI, like "http://www.gutenberg.org/ebooks/10053"

        Returns:
            dict : two sets of titles under the keys 'bookshelves' and 'subjects'

        """
        where_clause = f"""
            WHERE {{
                <{book_id}> dcterms:subject [dcterms:title ?subject];
                    pgterms:bookshelf [dcterms:title ?bookshelf] ;
                    dcterms:type dcmitype:Text.                             
            }}
            ORDER BY ?subject
            """
        rows = self._get_query_results(
            self._namespace + "SELECT DISTINCT ?subject ?bookshelf" + where_clause
        )

        # Every row pairs one subject with one bookshelf; sets remove the repeats.
        subject_titles = {row["subject"] for row in rows}
        shelf_titles = {row["bookshelf"] for row in rows}
        return {"bookshelves": shelf_titles, "subjects": subject_titles}

    # Queries
    def _get_query_results(self, query):
        """Use a SPARQL query to get results from the triplet store.

        The query is sent through ``self._sparql_endpoint`` with JSON as the
        return format, and every binding of the response is flattened.

        Args:
          query(str): A structured string in the SPARQL language used to ask the triplet store.

        Returns:
            list : one dict per result binding, mapping each variable name to
            the "value" field of its binding

        """
        # The query is logged once; it used to be emitted twice per call.
        logger.debug(f"GUTENBERG: Executing query: \n{query}")

        sparql = self._sparql_endpoint
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        # TODO: handle remote server better than this, please:
        query_results = sparql.queryAndConvert()

        # Flatten each binding from {"var": {"value": ...}} to {"var": ...}.
        return [
            {key: value["value"] for key, value in entry.items()}
            for entry in query_results["results"]["bindings"]
        ]

    def get_metadata(self, item):
        """Merge the metadata query results for an item into a single dict.

        Each result set returned by ``self._metadata_query(item)`` is handled by
        size: an empty set is ignored, a single row is copied as is, and for
        several rows every key of the first row is mapped to the list of that
        key's values across all rows.

        Args:
          item: The object whose metadata is requested; it is passed unchanged to ``_metadata_query``.

        Returns:
            dict : the merged metadata

        """
        collected = {}
        for rows in self._metadata_query(item):
            if len(rows) == 0:
                continue
            first_row = rows[0]
            if len(rows) == 1:
                collected.update(first_row)
            else:
                for field in first_row:
                    collected[field] = [row[field] for row in rows]

        return collected

    def _metadata_query(self, item):
        """Run one metadata query per template for a book or an author.

        For a ``GutenbergBook`` the templates in ``self._book_metadata`` are
        filled with ``item.get_book_id()``; for a ``GutenbergAuthor`` the
        templates in ``self._author_metadata`` are filled with
        ``item.get_gutenberg_id()``. Any other item yields an empty list.

        Args:
          item: A GutenbergBook or GutenbergAuthor instance.

        Returns:
            list : one list of result rows per metadata template

        TODO: fix this! One query should be sufficent.
        """
        prefix = self._namespace + """ SELECT DISTINCT *
            WHERE {
            """
        if isinstance(item, GutenbergBook):
            identifier = item.get_book_id()
            templates = self._book_metadata
        elif isinstance(item, GutenbergAuthor):
            identifier = item.get_gutenberg_id()
            templates = self._author_metadata
        else:
            return []

        # Each template is %-formatted with the identifier and closed with "}".
        return [
            self._get_query_results(prefix + template % identifier + "}")
            for template in templates
        ]

    def statistics(self):
        """Return formatted counts of books, authors, bookshelves and subjects.

        Each count comes from the matching ``all_*`` method called with a
        COUNT select.

        Args:

        Returns:
          str: One line per statistic, each starting with a newline.

        Notes:
            This method is relatively slow due to the fact that it inspects the whole Gutenberg RDF.

        Examples:

            >>> gutenberg_data.statistics()
            # number of books        :     60101
            # number of authors      :     20908
            # number of bookshelves  :       335
            # number of subjects     :     17524
        """
        counts = {
            "number_of_books": self.all_books(select="SELECT (COUNT(DISTINCT ?title) as ?total)"),
            "number_of_authors": self.all_authors(select="SELECT (COUNT(DISTINCT ?author) as ?total)"),
            "number_of_bookshelves": self.all_bookshelves(
                select="SELECT (COUNT(DISTINCT ?bookshelf) as ?total)"),
            "number_of_subjects": self.all_subjects(select="SELECT (COUNT(DISTINCT ?subject) as ?total)"),
        }
        lines = [
            f"\n{label.replace('_', ' '):23}:\t {total:>5}"
            for label, total in counts.items()
        ]
        return "".join(lines)
__init__(self, sparql_endpoint) special

Initialize tools with the SPARQL endpoint, such as a local instance of the Apache Jena Fuseki server.

Parameters:

Name Type Description Default
sparql_endpoint str

URL of the triplet store containing Gutenberg Catalog triplets.

required
Source code in dhtk/data_sources/gutenberg/api/data.py
def __init__(self, sparql_endpoint):
    """Wrap the given SPARQL endpoint, such as a local instance of the
    Apache Jena Fuseki server, for later queries.

    Args:
        sparql_endpoint (str) : URL of the triplet store containing Gutenberg Catalog triplets.

    Raises:
        EnvironmentError: if the SPARQLWrapper cannot be created for the endpoint.
    """
    try:
        self._sparql_endpoint = SPARQLWrapper(sparql_endpoint)
    except Exception as error:
        hint = f"Check the sparql_endpoint you provided!: {sparql_endpoint}"
        raise EnvironmentError(hint) from error
    else:
        logger.info(f"GUTENBERG: GutenbergData instantiated using SPARQL endpoint: {sparql_endpoint}")
all_authors(self, select='SELECT DISTINCT ?author')

Return all authors in the store.

This method doesn't use the standard SPARQL query, but a specific one that returns only authors.

Parameters:

Name Type Description Default
select str

The SPARQL query (Default value = "SELECT DISTINCT ?author")

'SELECT DISTINCT ?author'

Returns:

Type Description
list

the query results

Source code in dhtk/data_sources/gutenberg/api/data.py
def all_authors(self, select="SELECT DISTINCT ?author"):
    """List every author name known to the store, or count them.

    The query is the concatenation of ``self._namespace``, the given SELECT
    clause and a WHERE block built from ``self._authors``.

    Args:
        select (str): SELECT clause placed in front of the WHERE block. When
            it contains the text "COUNT", the "total" binding of the first
            result row is returned instead of a list.
            (Default value = "SELECT DISTINCT ?author")

    Returns:
        list : the "author" binding of every result row, or the "total"
        value of the first row for a COUNT select.

    """
    where_clause = f"""
        WHERE {{
            {self._authors}
        }}
        ORDER BY ?author
    """
    rows = self._get_query_results(self._namespace + select + where_clause)
    if "COUNT" not in select:
        return [row["author"] for row in rows]
    return rows[0]["total"]
all_books(self, select='SELECT DISTINCT ?title ?author')

Return the title of all books in the store.

This method doesn't use the standard query, but a specific one that returns only titles and authors.

Parameters:

Name Type Description Default
select

(Default value = "SELECT DISTINCT ?title ?author")

'SELECT DISTINCT ?title ?author'

Returns:

Type Description
list

the query results

Source code in dhtk/data_sources/gutenberg/api/data.py
def all_books(self, select="SELECT DISTINCT ?title ?author"):
    """List the title and author of every book in the store, or count them.

    Args:
      select (str): SELECT clause placed in front of the WHERE block built
        from ``self._books``. When it contains the text "COUNT", the
        "total" binding of the first result row is returned instead.
        (Default value = "SELECT DISTINCT ?title ?author")

    Returns:
        list : one ``{"title": ..., "author": ...}`` dict per result row,
        or the "total" value of the first row for a COUNT select.

    """
    where_clause = f"""
        WHERE {{
            {self._books}
        }}
        ORDER BY ?title
      """
    rows = self._get_query_results(self._namespace + select + where_clause)

    if "COUNT" in select:
        return rows[0]["total"]
    return [{field: row[field] for field in ("title", "author")} for row in rows]
all_bookshelves(self, select='SELECT DISTINCT ?bookshelf')

Return all bookshelves in the store.

This method doesn't use the standard SPARQL query, but a specific one that returns only bookshelves.

Parameters:

Name Type Description Default
select str)

SPARQL query (Default value = "SELECT DISTINCT ?bookshelf")

'SELECT DISTINCT ?bookshelf'

Returns:

Type Description
list

the results for the query

Source code in dhtk/data_sources/gutenberg/api/data.py
def all_bookshelves(self, select="SELECT DISTINCT ?bookshelf"):
    """List every bookshelf known to the store, or count them.

    Args:
      select (str) : SELECT clause placed in front of the WHERE block built
        from ``self._shelves``. When it contains the text "COUNT", the
        "total" binding of the first result row is returned instead.
        (Default value = "SELECT DISTINCT ?bookshelf")

    Returns:
        list : the "bookshelf" binding of every result row, or the "total"
        value of the first row for a COUNT select.

    """
    where_clause = f"""
            WHERE {{
                {self._shelves}
            }}
            ORDER BY ?bookshelf
            """
    rows = self._get_query_results(self._namespace + select + where_clause)

    if "COUNT" not in select:
        return [row["bookshelf"] for row in rows]
    return rows[0]["total"]
all_subjects(self, select='SELECT DISTINCT ?subject')

Return all subjects in the store.

This method doesn't use the standard SPARQL query, but a specific one that returns only subjects.

Parameters:

Name Type Description Default
select str

The SPARQL query (Default value = "SELECT DISTINCT ?subject")

'SELECT DISTINCT ?subject'

Returns:

Type Description
list

a list with the query results

Source code in dhtk/data_sources/gutenberg/api/data.py
def all_subjects(self, select="SELECT DISTINCT ?subject"):
    """List every subject known to the store, or count them.

    Args:
      select (str): SELECT clause placed in front of the WHERE block built
        from ``self._subjects``. When it contains the text "COUNT", the
        "total" binding of the first result row is returned instead.
        (Default value = "SELECT DISTINCT ?subject")

    Returns:
        list : the "subject" binding of every result row, or the "total"
        value of the first row for a COUNT select.

    """
    where_clause = f"""
        WHERE {{
            {self._subjects}
        }}
        ORDER BY ?subject
    """
    rows = self._get_query_results(self._namespace + select + where_clause)

    if "COUNT" not in select:
        return [row["subject"] for row in rows]
    return rows[0]["total"]
bookshelves_subjects(self, book_id)

Return the bookshelves and the subjects of the given book, designated by its identifier.

Parameters:

Name Type Description Default
book_id(str)

A Gutenberg book identifier. Is a URI, like "http://www.gutenberg.org/ebooks/10053"

required

Returns:

Type Description
dict

the query results with 'bookshelves' and 'subjects'

Source code in dhtk/data_sources/gutenberg/api/data.py
def bookshelves_subjects(self, book_id):
    """Return the bookshelves and the subjects of the book with the given identifier.

    Args:
      book_id(str): A Gutenberg book identifier. Is a URI, like "http://www.gutenberg.org/ebooks/10053"

    Returns:
        dict : two sets of titles under the keys 'bookshelves' and 'subjects'

    """
    where_clause = f"""
        WHERE {{
            <{book_id}> dcterms:subject [dcterms:title ?subject];
                pgterms:bookshelf [dcterms:title ?bookshelf] ;
                dcterms:type dcmitype:Text.                             
        }}
        ORDER BY ?subject
        """
    rows = self._get_query_results(
        self._namespace + "SELECT DISTINCT ?subject ?bookshelf" + where_clause
    )

    # Every row pairs one subject with one bookshelf; sets remove the repeats.
    subject_titles = {row["subject"] for row in rows}
    shelf_titles = {row["bookshelf"] for row in rows}
    return {"bookshelves": shelf_titles, "subjects": subject_titles}
get_author(self, id)

Create an author object with information collected from the Gutenberg Store.

Parameters:

Name Type Description Default
id(str)

The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408'

required

Returns:

Type Description
str

the query results

Source code in dhtk/data_sources/gutenberg/api/data.py
def get_author(self, id):
    """Build an author object from the data held in the Gutenberg store.

    The author's name is read from the first row of a ``pgterms:name``
    query; the object is then enriched with ``get_metadata`` and with a
    "bibliography" entry produced by ``get_bibliography``.

    Args:
      id(str): The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408'

    Returns:
        GutenbergAuthor : the populated author object

    """
    where_clause = f"""
        WHERE {{
            <{id}> pgterms:name ?name . 
        }}
    """
    first_row = self._get_query_results(self._namespace + "SELECT DISTINCT *" + where_clause)[0]

    author = GutenbergAuthor(gutenberg_id=id, name=first_row["name"])
    catalog_metadata = self.get_metadata(author)
    author.update_metadata(catalog_metadata)
    bibliography = self.get_bibliography(id)
    author.update_metadata({"bibliography": bibliography})
    return author
get_bibliography(self, id)

To get all books written by an author.

Parameters:

Name Type Description Default
id(str)

The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408'

required

Returns:

Type Description
list

the query results

Source code in dhtk/data_sources/gutenberg/api/data.py
def get_bibliography(self, id):
    """Collect the titles of all books whose creator is the given author.

    Args:
      id(str): The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408'

    Returns:
        list : the "title" binding of every result row, ordered by title

    """
    where_clause = f"""
        WHERE {{
            ?book_id purl:creator <{id}> .
            {self._books}
        }}
        ORDER BY ?title
    """
    rows = self._get_query_results(self._namespace + "SELECT DISTINCT ?title" + where_clause)

    titles = []
    for row in rows:
        titles.append(row["title"])
    return titles
get_book(self, book_id, author=None)

Create a book object with information collected from the Gutenberg Store.

Parameters:

Name Type Description Default
book_id(str)

The book identifier is a URI, like 'http://www.gutenberg.org/ebooks/20063'

required
author

(Default value = None)

None

Returns:

Type Description
gutenberg.tools.book

A book object

Source code in dhtk/data_sources/gutenberg/api/data.py
def get_book(self, book_id, author=None):
    """Build a book object from the data held in the Gutenberg store.

    Title and creator are read from the first row of the book query. The
    subjects and bookshelves come from ``bookshelves_subjects`` and further
    metadata from ``get_metadata``.

    Args:
      book_id(str): The book identifier is a URI, like 'http://www.gutenberg.org/ebooks/20063'
      author: Author object to attach to the book. When falsy, the author is
        fetched with ``get_author`` from the book's creator. (Default value = None)

    Returns:
        GutenbergBook :  A book object

    """
    where_clause = f"""
        WHERE {{
            <{book_id}> purl:title ?title; 
                purl:creator ?author_id;
                dcterms:type dcmitype:Text.          
        }}"""
    record = self._get_query_results(self._namespace + "SELECT DISTINCT *" + where_clause)[0]

    if not author:
        author = self.get_author(record["author_id"])

    shelves_and_subjects = self.bookshelves_subjects(book_id)
    # Empty sets are passed on as None.
    subjects = shelves_and_subjects["subjects"] or None
    bookshelves = shelves_and_subjects["bookshelves"] or None

    book = GutenbergBook(
        gutenberg_id=book_id,
        title=record["title"],
        subject=subjects,
        bookshelf=bookshelves,
        author=author
    )
    book.update_metadata(self.get_metadata(book))
    return book
get_metadata(self, item)

Get metadata about the book that is present in the catalog.

Parameters:

Name Type Description Default
item(An object having an entry "gutenberg_id" in the results of the method get_metadata().

The metadata of these tools must contain an entry called: "gutenberg_id"

required

Returns:

Type Description
dict

the query results

Source code in dhtk/data_sources/gutenberg/api/data.py
def get_metadata(self, item):
    """Merge the metadata query results for an item into a single dict.

    Each result set returned by ``self._metadata_query(item)`` is handled by
    size: an empty set is ignored, a single row is copied as is, and for
    several rows every key of the first row is mapped to the list of that
    key's values across all rows.

    Args:
      item: The object whose metadata is requested; it is passed unchanged to ``_metadata_query``.

    Returns:
        dict : the merged metadata

    """
    collected = {}
    for rows in self._metadata_query(item):
        if len(rows) == 0:
            continue
        first_row = rows[0]
        if len(rows) == 1:
            collected.update(first_row)
        else:
            for field in first_row:
                collected[field] = [row[field] for row in rows]

    return collected
search_by_author(self, name, alias=None)

Search the Gutenberg catalog for authors by name and, optionally, by alias. The standard SPARQL query is overridden by filters and by a sort instruction.

Parameters:

Name Type Description Default
name(str)

Author's name, first name, last name or alias in plain text, case-insensitive. It can be a part of the author's name. (Default value = None)

required

Returns:

Type Description
str

the query results

Source code in dhtk/data_sources/gutenberg/api/data.py
def search_by_author(self, name, alias=None):
    """Search authors in the Gutenberg catalog by name and, optionally, by alias.

    Both ``name`` and ``alias`` are split on non-word characters; every
    non-empty token becomes its own case-insensitive ``CONTAINS`` filter,
    so all tokens have to match.

    Args:
      name(str): Author's name, first name, last name or alias in plain text, case-insensitive.
        It can be a part of the author's name.
      alias(str, optional): Alias in plain text, case-insensitive. When given, the name tokens
        are matched against the author name only and the alias tokens against the aliases;
        otherwise each name token may match either one. (Default value = None)

    Returns:
        list : the query results, one dict per result row

    """
    # Empty tokens (produced by leading/trailing punctuation such as "Poe, ")
    # carry no search text, so they are dropped instead of becoming filters.
    name_tokens = [token.lower() for token in re.split(r'\W+', name) if token]
    alias_tokens = []
    if alias is not None:
        alias_tokens = [token.lower() for token in re.split(r'\W+', alias) if token]

    clauses = []
    for token in name_tokens:
        clause = f'FILTER (CONTAINS(lcase(str(?author)), "{token}")'
        if alias is None:
            # No explicit alias: the name itself might be an alias.
            clause += f' ||  CONTAINS(lcase(str(?aliases)), "{token}")'
        clauses.append(clause + ")\n")
    for token in alias_tokens:
        clauses.append(f'FILTER (CONTAINS(lcase(str(?aliases)), "{token}"))\n')
    filters = "".join(clauses)

    select = "SELECT DISTINCT ?author ?aliases ?author_id"
    pattern = f"""
        WHERE {{
            {self._authors}
            {filters}
            OPTIONAL {{ ?author_id pgterms:alias ?aliases. }}
        }}
        ORDER BY ?author
        """

    return self._get_query_results(self._namespace + select + pattern)
search_by_bookshelf(self, bookshelf)

Search the Gutenberg catalog for all books on a bookshelf matching the given string.

The bookshelf is used as parameter in a SPARQL query.

Parameters:

Name Type Description Default
bookshelf(str)

bookshelf in plain text, case-insensitive. It can be a part of the bookshelf.

required

Returns:

Type Description
str

result of the query

Source code in dhtk/data_sources/gutenberg/api/data.py
def search_by_bookshelf(self, bookshelf):
    """Search the Gutenberg catalog for all books on a bookshelf matching the given string.

    The bookshelf is used as parameter in a SPARQL query.

    Args:
      bookshelf(str): bookshelf in plain text, case-insensitive. It can be a part of the bookshelf.

    Returns:
        list : the query results, one dict per result row

    """
    logger.info(f"GUTENBERG: Searching bookshelf: {bookshelf}")

    # Escape the characters that would terminate or corrupt the double-quoted
    # SPARQL string literal (backslash first), so quotes in the search text
    # cannot break or alter the query.
    needle = bookshelf.lower()
    for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
        needle = needle.replace(raw, escaped)

    select = "SELECT DISTINCT *"
    pattern = f"""
        WHERE {{
            {self._shelves}
            FILTER CONTAINS(lcase(str(?bookshelf)), "{needle}")
            {self._books}
            OPTIONAL {{ ?book_id dcterms:language [rdf:value ?language].}}
        }}
        ORDER BY ?author ?title
        """
    query = self._namespace + select + pattern
    return self._get_query_results(query)
search_by_subject(self, subject, limit=0)

Search the Gutenberg catalog for all books with the given subject string.

The subject is used as parameter in a SPARQL query. If no limit value is specified, the method returns all books with the given subject.

Parameters:

Name Type Description Default
subject(str)

Subject in plain text, case-insensitive. It can be a part of the subject.

required
limit(int, optional)

Use to limit how many books are returned by the SPARQL query. (Default value = 0)

0

Returns:

Type Description
str

the query results

Source code in dhtk/data_sources/gutenberg/api/data.py
def search_by_subject(self, subject, limit=0):
    """Search the Gutenberg catalog for all books with the given subject string.

    The subject is used as parameter in a SPARQL query. If no limit value is specified,
    the method returns all books with the given subject.

    Args:
      subject(str): Subject in plain text, case-insensitive. It can be a part of the subject.
      limit(int, optional): Use to limit how many books are returned by the SPARQL query.
        Values of 0 or less mean no limit. (Default value = 0)

    Returns:
        list : the query results, one dict per result row

    """
    # Escape the characters that would terminate or corrupt the double-quoted
    # SPARQL string literal (backslash first), so quotes in the search text
    # cannot break or alter the query.
    needle = subject.lower()
    for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
        needle = needle.replace(raw, escaped)

    select = "SELECT DISTINCT *"
    pattern = f"""
                WHERE {{
                    {self._subjects}
                    FILTER CONTAINS(lcase(str(?subject)), "{needle}")
                    {self._books}
                    OPTIONAL {{ ?book_id dcterms:language [rdf:value ?language].}}
                }}
                ORDER BY ?author ?title
                """
    query = self._namespace + select + pattern
    if limit > 0:
        query += f"LIMIT {limit}"

    return self._get_query_results(query)
search_by_title(self, title)

Search the Gutenberg catalog for all books with the given title string.

Parameters:

Name Type Description Default
title(str)

Title in plain text, case-insensitive. It can be a part of the title.

required

Returns:

Type Description
str

the query results

Source code in dhtk/data_sources/gutenberg/api/data.py
def search_by_title(self, title):
    """Search the Gutenberg catalog for all books whose title contains the given string.

    Args:
      title(str): Title in plain text, case-insensitive. It can be a part of the title.

    Returns:
        list : the query results, one dict per result row

    """
    # Build the SPARQL string literal explicitly instead of relying on
    # Python's repr(): escape the characters that would terminate or
    # corrupt a double-quoted literal (backslash first).
    needle = title.lower()
    for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
        needle = needle.replace(raw, escaped)

    select = "SELECT DISTINCT ?book_id ?title ?author_id ?author ?language"
    pattern = f"""
        WHERE {{
          {self._books}
          OPTIONAL {{?book_id dcterms:language [rdf:value ?language].}}
          FILTER CONTAINS(lcase(str(?title)), "{needle}")
        }}
        ORDER BY ?author ?title
        """
    query = self._namespace + select + pattern
    return self._get_query_results(query)
statistics(self)

Return formatted information about the Gutenberg catalog.

Returns:

Type Description
str

Formatted string of different statistics. Subject counts sub-subjects too.

Note: This method is relatively slow because it inspects the whole Gutenberg RDF.

Examples:

>>> gutenberg_data.statistics()
# number of books        :     60101
# number of authors      :     20908
# number of bookshelves  :       335
# number of subjects     :     17524
Source code in dhtk/data_sources/gutenberg/api/data.py
def statistics(self):
    """Return formatted counts of books, authors, bookshelves and subjects.

    Each count comes from the matching ``all_*`` method called with a
    COUNT select.

    Args:

    Returns:
      str: One line per statistic, each starting with a newline.

    Notes:
        This method is relatively slow due to the fact that it inspects the whole Gutenberg RDF.

    Examples:

        >>> gutenberg_data.statistics()
        # number of books        :     60101
        # number of authors      :     20908
        # number of bookshelves  :       335
        # number of subjects     :     17524
    """
    counts = {
        "number_of_books": self.all_books(select="SELECT (COUNT(DISTINCT ?title) as ?total)"),
        "number_of_authors": self.all_authors(select="SELECT (COUNT(DISTINCT ?author) as ?total)"),
        "number_of_bookshelves": self.all_bookshelves(
            select="SELECT (COUNT(DISTINCT ?bookshelf) as ?total)"),
        "number_of_subjects": self.all_subjects(select="SELECT (COUNT(DISTINCT ?subject) as ?total)"),
    }
    lines = [
        f"\n{label.replace('_', ' '):23}:\t {total:>5}"
        for label, total in counts.items()
    ]
    return "".join(lines)

data special

builder

get_metadata(self)

Parameters:

Name Type Description Default
self required
Source code in dhtk/data_sources/gutenberg/data/builder.py
def get_metadata(self):
    """Download, unpack, process and serialize the Gutenberg.org metadata.

    The steps run in order through ``self.MANAGERS.SYSTEM``: download
    ``self.data_file``, unpack the archive into ``self.graph``, hand the
    unpack log to ``self._process_graph`` and serialize the graph under the
    file name ``self.master``.

    Args:
        self:

    Returns:
        None

    """
    # Step 1: fetch the metadata archive.
    self.MANAGERS.SYSTEM.download_files(self.data_file,
                                        configuration={"file": "downloads", "setting": "gutenberg"},
                                        desc="Downloading Gutenberg.org metadata")

    # Step 2: unpack the bz2 archive into the graph; the call also returns a log.
    self.graph, log = self.MANAGERS.SYSTEM.unpack_archive(g=self.graph,
                                                          configuration={"file": "gutenberg", "setting": "books"},
                                                          desc="Unpacking Gutenberg.org metadata",
                                                          archive_type="r:bz2")

    # Step 3: post-process the graph using the captured log, then write it out
    # together with the number of unpacked files.
    self._process_graph(log)
    self.MANAGERS.SYSTEM.serialize(self.graph, len(self.MANAGERS.SYSTEM.unpacked_files), file_name=self.master)
owl_test()

Trying to automatically extract class information from triples

check: https://gitlab.com/neves.ces/bnf_project/-/blob/d3d2d1b40d7c6243481c4bf97bc48184e358d905/Gutenberg_process.ipynb

Source code in dhtk/data_sources/gutenberg/data/builder.py
def owl_test():
    """Experiment: try to automatically extract class information from triples.

    Loads the ontology "master.rdf" with owlready2, declares a sample class
    and creates another class dynamically with ``type``.

    check:
    https://gitlab.com/neves.ces/bnf_project/-/blob/d3d2d1b40d7c6243481c4bf97bc48184e358d905/Gutenberg_process.ipynb

    Args:

    Returns:
        None

    """
    import owlready2
    onto = owlready2.get_ontology(base_iri="master.rdf")
    onto._load()
    # NOTE(review): `get_ontology` is used unqualified here while only
    # `owlready2` is imported in this function - presumably a module-level
    # import provides it; confirm. The return value is discarded.
    get_ontology("master.rdf#")

    with onto:
        class Drug(owlready2.Thing):
            """Sample class declared statically inside the ontology context."""
            pass

    # Class constructors
    def constructor(self, arg):
        """Store ``arg`` on the instance; installed as ``__init__`` below.

        Args:
          arg: value saved as ``self.constructor_arg``

        Returns:
            None

        """
        self.constructor_arg = arg

    def define_class(name, arg):
        """Create a class called ``name`` derived from ``owlready2.Thing``.

        Args:
          name: name of the class to create
          arg: not used by this function

        Returns:
            None

        """
        with onto:
            type(name, (owlready2.Thing,), {"__init__": constructor})

    define_class("test", 1)
    # The list of classes is built but not returned or stored.
    list(onto.classes())
serialize(self, graph, n_files, file_name='master.rdf')

Parameters:

Name Type Description Default
graph required
n_files required
file_name

(Default value = "master.rdf")

'master.rdf'
Source code in dhtk/data_sources/gutenberg/data/builder.py
def serialize(self, graph, n_files, file_name="master.rdf"):
    """Write ``graph`` as RDF/XML into the working directory and report progress.

    Args:
      graph: Object supporting ``len()`` and
        ``serialize(destination=..., format=..., encoding=...)``.
      n_files: Number of source graphs that were merged; only used in the
        printed summary.
      file_name: Name of the output file inside ``self.wd``.
        (Default value = "master.rdf")

    Returns:
        None
    """
    triple_count = len(graph)
    print("{} triples merged from {} RDF graphs".format(triple_count, n_files))

    # Build the target path once and reuse it for writing and reporting.
    target = self.wd / file_name
    graph.serialize(destination=str(target), format='xml', encoding="utf-8")

    print("Saved to {}".format(target))
unpack_archive(self, g=<Graph identifier=N706aa1ef48b34bbaa288f4a3b27fad93 (<class 'rdflib.graph.Graph'>)>, configuration=None, desc='Unpacking archive', archive_type='r:bz2')

Parameters:

Name Type Description Default
g rdflib.Graph

The graph where unpack (Default value = rdflib.Graph())

<Graph identifier=N706aa1ef48b34bbaa288f4a3b27fad93 (<class 'rdflib.graph.Graph'>)>
configuration str

Configuration settings (Default value = None)

None
desc str

The description of the work (Default value = "Unpacking archive")

'Unpacking archive'
archive_type str

the suffix extension (Default value = "r:bz2")

'r:bz2'

Returns:

Type Description
g (rdflib.Graph)

The graph. log (str): The log output captured while unpacking.

Source code in dhtk/data_sources/gutenberg/data/builder.py
def unpack_archive(self, g=None, configuration=None, desc="Unpacking archive", archive_type="r:bz2"):
    """Extract every archive in ``self.last_download`` and merge its files into a graph.

    Each archive member is extracted flat (folder structure stripped) into
    ``self.wd`` and handed to ``self._merge_graphs`` together with the running
    graph.  Once an archive has been processed it is deleted.  Messages that
    the ``rdflib.term`` logger emits meanwhile are captured and returned so
    that broken URIs can be identified afterwards.

    Args:
      g (rdflib.Graph): Graph to merge into.  A new empty graph is created
        when omitted.  (Default value = None)
      configuration (dict): Record key passed to
        ``self.MANAGERS.CONFIG.add_record`` for storing the extracted file
        names.  (Default value = {"file": "downloads", "setting": "files"})
      desc (str): Label of the progress bar.  (Default value = "Unpacking archive")
      archive_type (str): Mode passed to ``tarfile.open``.  (Default value = "r:bz2")

    Returns:
        g (rdflib.Graph): The graph returned by the last ``_merge_graphs`` call
            (or the input graph when nothing was extracted).
        log (str): Text captured from the ``rdflib.term`` logger.

    Raises:
        OSError: If an archive cannot be opened or removed.
    """
    if g is None:
        # The former default ``g=rdflib.Graph()`` was evaluated once at
        # definition time, so every call without ``g`` kept adding to the same
        # shared graph.
        g = rdflib.Graph()
    if configuration is None:
        configuration = {"file": "downloads", "setting": "files"}

    # Capture rdflib.graph.parse logs to identify broken URIs
    logger = logging.getLogger("rdflib.term")
    log_capture_string = io.StringIO()
    ch = logging.StreamHandler(log_capture_string)
    logger.addHandler(ch)

    file_names = []

    try:
        for archive in self.last_download:
            print("Opening {}".format(archive))
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(archive, archive_type) as tar:
                for member in tqdm.tqdm(tar.getmembers(), desc=desc):
                    # Cleans the folder structure; this also keeps members
                    # from being written outside of ``self.wd``.
                    member.name = os.path.basename(member.name)
                    tar.extract(member, self.wd)

                    g = self._merge_graphs(self.wd / member.name, g)

                    file_names.append(member.name)

            print("Removing {} ".format(archive))
            # Portable replacement for spawning an external ``rm`` process.
            os.remove(archive)
    finally:
        # Detach the handler; otherwise each call would leave one more
        # handler (and its buffer) attached to the shared logger.
        logger.removeHandler(ch)
    log = log_capture_string.getvalue()

    # Save to config
    self.MANAGERS.CONFIG.add_record(configuration, value=file_names, unique=True)
    self.unpacked_files = file_names

    return g, log

tools special

author

Contains the GutenbergAuthor Class.

GutenbergAuthor (Author)

The class for an author extended with variables from the gutenberg project. Child of the generic Author class.

Example
from pprint import pprint
from  dhtk.data_sources.gutenberg.author import GutenbergAuthor

# Create an author manually.
adam_smith_author = GutenbergAuthor(
    gutenberg_id='http://www.gutenberg.org/2009/agents/1158',
    name='Smith, Adam'
)

# create an author from the gutenberg repository

from dhtk.data_sources.gutenberg.data import GutenbergData

database = GutenbergData()
database.search_author_by_name("Adam", "Smith")
# [('Smith, Adam', 'http://www.gutenberg.org/2009/agents/1158'),
#  ('Smith, George Adam', 'http://www.gutenberg.org/2009/agents/5016')]

adam_smith_author_pg = database.author_from_author_id(
    'http://www.gutenberg.org/2009/agents/1158'
)

adam_smith_author_pg.print_info()
# Adam Smith
# Metadata    :
#     - gutenberg_id: http://www.gutenberg.org/2009/agents/1158
#     - id          : http://www.gutenberg.org/2009/agents/1158
#     - gutenberg_name:  Smith, Adam
#     - gutenberg_aliases:
#     - aliases     :
#     - web_pages   :
#             - http://en.wikipedia.org/wiki/Adam_Smith
#     - birth_date  :         1723
#     - death_date  :         1790
Source code in dhtk/data_sources/gutenberg/tools/author.py
class GutenbergAuthor(Author):
    """The class for an author extended with variables from the gutenberg project.
    Child of the generic Author class.

    Example
    --------

        from pprint import pprint
        from  dhtk.data_sources.gutenberg.author import GutenbergAuthor

        # Create an author manually.
        adam_smith_author = GutenbergAuthor(
            gutenberg_id='http://www.gutenberg.org/2009/agents/1158',
            name='Smith, Adam'
        )

        # create an author from the gutenberg repository

        from dhtk.data_sources.gutenberg.data import GutenbergData

        database = GutenbergData()
        database.search_author_by_name("Adam", "Smith")
        # [('Smith, Adam', 'http://www.gutenberg.org/2009/agents/1158'),
        #  ('Smith, George Adam', 'http://www.gutenberg.org/2009/agents/5016')]

        adam_smith_author_pg = database.author_from_author_id(
            'http://www.gutenberg.org/2009/agents/1158'
        )

        adam_smith_author_pg.print_info()
        # Adam Smith
        # Metadata    :
        #     - gutenberg_id: http://www.gutenberg.org/2009/agents/1158
        #     - id          : http://www.gutenberg.org/2009/agents/1158
        #     - gutenberg_name:  Smith, Adam
        #     - gutenberg_aliases:
        #     - aliases     :
        #     - web_pages   :
        #             - http://en.wikipedia.org/wiki/Adam_Smith
        #     - birth_date  :         1723
        #     - death_date  :         1790
    """

    def __init__(self, gutenberg_id, name, aliases=None, web_pages=None, same_as=None, **kwargs):
        """
        Init function of GutenbergAuthor.

        Notes:
            Implement the Abstract Author class and extend it with the gutenberg id, the eventual
            alias(es) and the eventual web page(s) of the author. Converts name into a HumanName
            object with convert_name.

        Parameters:
            gutenberg_id (str): URI of the gutenberg author in the Gutenberg RDF.
            name (str): Name of the author.
            aliases (set(str)): Eventual aliases of the author; any non-set value is treated as
                "no aliases". Default None.
            web_pages (set(str)): Eventual web pages of the author; any non-set value is treated
                as "no web pages". Default None.
            same_as (dict): Forwarded to the parent constructor; an empty dict when omitted.
            **kwargs: Forwarded unchanged to the parent constructor.

        Raises:
            ReferenceError: If gutenberg_id is not a Gutenberg agent URI.
        """
        if same_as is None:
            same_as = {}
        # Dots are escaped so that only the literal host name is accepted;
        # fullmatch() already anchors both ends of the pattern.
        id_format = re.compile(r"http://www\.gutenberg\.org/2009/agents/\d+")
        if not id_format.fullmatch(gutenberg_id):
            raise ReferenceError(f"This gutenberg id is not valid: {gutenberg_id}")
        # NOTE(review): self.metadata is used before super().__init__() runs, so it is
        # presumably defined at class level on Author - verify it is not shared between
        # instances.
        self.metadata["gutenberg_id"] = gutenberg_id
        self.metadata["id"] = gutenberg_id

        self.metadata["gutenberg_name"] = name

        if not isinstance(aliases, set):
            aliases = set()
        self.metadata["aliases"] = aliases

        if not isinstance(web_pages, set):
            web_pages = set()
        self.metadata["web_pages"] = web_pages

        # Add saint to nameparser's FIRST_NAME_TITLES
        FIRST_NAME_TITLES.add("saint")
        name = self.convert_name(name)

        # convert_name() may add to self.metadata["aliases"], which can be the very set
        # being walked here; iterating over a snapshot avoids
        # "RuntimeError: Set changed size during iteration".
        self.metadata["aliases"] = {str(self.convert_name(alias)) for alias in list(aliases)}

        if str(name) in self.metadata["aliases"]:
            self.metadata["aliases"].remove(str(name))

        metadata = copy.copy(self.metadata)
        super().__init__(
            name,
            same_as=same_as,
            metadata=metadata,
            **kwargs
        )

    def convert_name(self, human_name):
        """Convert a string containing a human name into a HumanName object.

        A suffix is moved into ``self.metadata["gutenberg_name_suffix"]`` and cleared on the
        returned name.  When the name carries a nickname that spells out an abbreviated first
        or middle name (e.g. first name "J." with nickname "John"), the abbreviation is replaced
        by the spelled-out form and the variant without nickname is recorded in
        ``self.metadata["aliases"]``.

        Args:
          human_name (str): Author's name in a string.

        Returns:
          HumanName: The parsed, and possibly expanded, name.

        Notes:
            Is done in the init for every GutenbergAuthor object with initial string parameter "name".
        """
        # An abbreviated name part such as "J."; group 2 captures the initial.
        abbreviation = r"(([A-Z])[a-z]*[.])"

        human_name = HumanName(human_name)
        if human_name.suffix:
            self.metadata["gutenberg_name_suffix"] = human_name.suffix
            human_name.suffix = ""
        if human_name.nickname:
            no_nickname = copy.copy(human_name)
            no_nickname.nickname = ""
            # Turn every abbreviation into "<initial>\w+" and try it against the nickname.
            # The flag is passed by keyword: the fourth positional argument of re.sub() is
            # `count`, so the previous positional re.UNICODE was read as count=32.
            # NOTE(review): the rest of the name part is used as a regex verbatim.
            first_name_match = re.match(
                re.sub(abbreviation, r"\2\\w+", human_name.first, flags=re.UNICODE),
                human_name.nickname,
                re.UNICODE
            )
            if first_name_match and len(first_name_match.group(0)) >= len(human_name.first):
                human_name.first = first_name_match.group(0)
                # Drop the part of the nickname that became the first name.
                human_name.nickname = human_name.nickname[len(human_name.first):].strip()
                self.metadata["aliases"] = {str(no_nickname), }
            middle_name_match = re.match(
                re.sub(abbreviation, r"\2\\w+", human_name.middle, flags=re.UNICODE),
                human_name.nickname,
                re.UNICODE
            )
            if middle_name_match and len(middle_name_match.group(0)) >= len(human_name.middle):
                human_name.middle = middle_name_match.group(0)
                human_name.nickname = human_name.nickname[len(human_name.middle):].strip()
                self.metadata["aliases"].add(str(no_nickname))
        return human_name

    def get_gutenberg_id(self):
        """
        Get the gutenberg id url of the author.

        Returns:
            str: the gutenberg id

        """
        return self.metadata["gutenberg_id"]

    def __eq__(self, other):
        """
        Equality function between authors.

        Notes:
            Test the equality of the two authors. Using the gutenberg_id if other is an instance of
            GutenbergAuthor. If not, it uses the dhtk.common.author.Author.__eq__() method that uses the
            author's names (first, last) and its birthdate.

        Args:
            other (dhtk.common.author.Author): An instance of dhtk.common.author.Author or of its child-classes.

        Returns:
            equality (bool) : A bool that tells if the authors are the same or not.

        """
        if isinstance(other, GutenbergAuthor):
            equals = self.get_gutenberg_id() == other.get_gutenberg_id()
        else:
            equals = super().__eq__(other)
        return equals

    def __hash__(self):
        """
        Return hash for the author.

        Notes:
            Built from first name, last name and birth date rather than from the gutenberg id.
            NOTE(review): two GutenbergAuthor objects with the same id but differently parsed
            names compare equal yet hash differently - confirm this cannot happen in practice.

        Returns:
            hash (int) : The hash value for the author.
        """
        return hash((self.get_first_name() + self.get_last_name() + self.get_birth_date()))

    def __repr__(self):
        """
        Return a short description: last name, first name and gutenberg id.

        Returns:
            object_str (str) : String representing the object
        """
        # Fixed bracket order: the closing ")" and ">" were swapped (".. (id>)").
        return f"<Author: {self.get_last_name()}, {self.get_first_name()} " \
               f"({self.get_gutenberg_id()})>"
__eq__(self, other) special

Equality function between authors.

!!! notes Test the equality of the two authors. Using the gutenberg_id if other is an instance of GutenbergAuthor. If not, it uses the dhtk.common.author.Author.eq() method that uses the author's names (first, last) and its birthdate.

Parameters:

Name Type Description Default
other dhtk.common.author.Author

An instance of dhtk.common.author.Author or of its child-classes.

required

Returns:

Type Description
equality (bool)

A bool that tells if the authors are the same or not.

Source code in dhtk/data_sources/gutenberg/tools/author.py
def __eq__(self, other):
    """
    Tell whether ``other`` denotes the same author.

    Notes:
        Two GutenbergAuthor objects are compared through their gutenberg ids. Any other object is
        delegated to the parent ``__eq__`` (documented as comparing the author's first and last
        names and birthdate).

    Args:
        other (dhtk.common.author.Author): An instance of dhtk.common.author.Author or of one of
            its child-classes.

    Returns:
        equality (bool) : True when both objects are considered the same author.

    """
    if not isinstance(other, GutenbergAuthor):
        return super().__eq__(other)
    return self.get_gutenberg_id() == other.get_gutenberg_id()
__hash__(self) special

Return hash for the author.

Returns:

Type Description
hash (int)

The hash value for the author.

Source code in dhtk/data_sources/gutenberg/tools/author.py
def __hash__(self):
    """
    Compute the hash of the author.

    The hashed value is the concatenation of first name, last name and birth
    date, in that order.

    Returns:
        hash (int) : The hash value for the author.
    """
    identity = self.get_first_name() + self.get_last_name() + self.get_birth_date()
    return hash(identity)
__init__(self, gutenberg_id, name, aliases=None, web_pages=None, same_as=None, **kwargs) special

Init function of GutenbergAuthor.

!!! notes Implement the Abstract Author class and extend it with the gutenberg id, the eventual alias(es) and the eventual web page(s) of the author. Converts name into a HumanName object with convert_name.

Parameters:

Name Type Description Default
gutenberg_id str)

URI of the gutenberg author in the Gutenberg RDF.

required
name str)

Name of the author.

required
aliases set(str

Eventual aliases of the author.default None

None
web_pages set(str

Eventual web pages of the author. default None

None
Source code in dhtk/data_sources/gutenberg/tools/author.py
def __init__(self, gutenberg_id, name, aliases=None, web_pages=None, same_as=None, **kwargs):
    """
    Init function of GutenbergAuthor.

    Notes:
        Implement the Abstract Author class and extend it with the gutenberg id, the eventual
        alias(es) and the eventual web page(s) of the author. Converts name into a HumanName
        object with convert_name.

    Parameters:
        gutenberg_id (str): URI of the gutenberg author in the Gutenberg RDF.
        name (str): Name of the author.
        aliases (set(str)): Eventual aliases of the author; any non-set value is treated as
            "no aliases". Default None.
        web_pages (set(str)): Eventual web pages of the author; any non-set value is treated
            as "no web pages". Default None.
        same_as (dict): Forwarded to the parent constructor; an empty dict when omitted.
        **kwargs: Forwarded unchanged to the parent constructor.

    Raises:
        ReferenceError: If gutenberg_id is not a Gutenberg agent URI.
    """
    if same_as is None:
        same_as = {}
    # Dots are escaped so that only the literal host name is accepted;
    # fullmatch() already anchors both ends of the pattern.
    id_format = re.compile(r"http://www\.gutenberg\.org/2009/agents/\d+")
    if not id_format.fullmatch(gutenberg_id):
        raise ReferenceError(f"This gutenberg id is not valid: {gutenberg_id}")
    # NOTE(review): self.metadata is used before super().__init__() runs, so it is
    # presumably defined at class level on Author - verify it is not shared between
    # instances.
    self.metadata["gutenberg_id"] = gutenberg_id
    self.metadata["id"] = gutenberg_id

    self.metadata["gutenberg_name"] = name

    if not isinstance(aliases, set):
        aliases = set()
    self.metadata["aliases"] = aliases

    if not isinstance(web_pages, set):
        web_pages = set()
    self.metadata["web_pages"] = web_pages

    # Add saint to nameparser's FIRST_NAME_TITLES
    FIRST_NAME_TITLES.add("saint")
    name = self.convert_name(name)

    # convert_name() may add to self.metadata["aliases"], which can be the very set
    # being walked here; iterating over a snapshot avoids
    # "RuntimeError: Set changed size during iteration".
    self.metadata["aliases"] = {str(self.convert_name(alias)) for alias in list(aliases)}

    if str(name) in self.metadata["aliases"]:
        self.metadata["aliases"].remove(str(name))

    metadata = copy.copy(self.metadata)
    super().__init__(
        name,
        same_as=same_as,
        metadata=metadata,
        **kwargs
    )
__repr__(self) special

Returns:

Type Description
object_str (str)

String representing the object

Source code in dhtk/data_sources/gutenberg/tools/author.py
def __repr__(self):
    """
    Return a short description: last name, first name and gutenberg id.

    Returns:
        object_str (str) : String representing the object
    """
    # Fixed bracket order: the closing ")" and ">" were swapped (".. (id>)").
    return f"<Author: {self.get_last_name()}, {self.get_first_name()} " \
           f"({self.get_gutenberg_id()})>"
convert_name(self, human_name)

Convert a string containing a human name into a HumanName object.

Parameters:

Name Type Description Default
human_name(str)

Author's name in a string.

required

Returns:

!!! notes Is done in the init for every GutenbergAuthor object with initial string parameter "name".

Source code in dhtk/data_sources/gutenberg/tools/author.py
def convert_name(self, human_name):
    """Convert a string containing a human name into a HumanName object.

    A suffix is moved into ``self.metadata["gutenberg_name_suffix"]`` and cleared on the
    returned name.  When the name carries a nickname that spells out an abbreviated first
    or middle name (e.g. first name "J." with nickname "John"), the abbreviation is replaced
    by the spelled-out form and the variant without nickname is recorded in
    ``self.metadata["aliases"]``.

    Args:
      human_name (str): Author's name in a string.

    Returns:
      HumanName: The parsed, and possibly expanded, name.

    Notes:
        Is done in the init for every GutenbergAuthor object with initial string parameter "name".
    """
    # An abbreviated name part such as "J."; group 2 captures the initial.
    abbreviation = r"(([A-Z])[a-z]*[.])"

    human_name = HumanName(human_name)
    if human_name.suffix:
        self.metadata["gutenberg_name_suffix"] = human_name.suffix
        human_name.suffix = ""
    if human_name.nickname:
        no_nickname = copy.copy(human_name)
        no_nickname.nickname = ""
        # Turn every abbreviation into "<initial>\w+" and try it against the nickname.
        # The flag is passed by keyword: the fourth positional argument of re.sub() is
        # `count`, so the previous positional re.UNICODE was read as count=32.
        # NOTE(review): the rest of the name part is used as a regex verbatim.
        first_name_match = re.match(
            re.sub(abbreviation, r"\2\\w+", human_name.first, flags=re.UNICODE),
            human_name.nickname,
            re.UNICODE
        )
        if first_name_match and len(first_name_match.group(0)) >= len(human_name.first):
            human_name.first = first_name_match.group(0)
            # Drop the part of the nickname that became the first name.
            human_name.nickname = human_name.nickname[len(human_name.first):].strip()
            self.metadata["aliases"] = {str(no_nickname), }
        middle_name_match = re.match(
            re.sub(abbreviation, r"\2\\w+", human_name.middle, flags=re.UNICODE),
            human_name.nickname,
            re.UNICODE
        )
        if middle_name_match and len(middle_name_match.group(0)) >= len(human_name.middle):
            human_name.middle = middle_name_match.group(0)
            human_name.nickname = human_name.nickname[len(human_name.middle):].strip()
            self.metadata["aliases"].add(str(no_nickname))
    return human_name
get_gutenberg_id(self)

Get the gutenberg id url of the author.

Returns:

Type Description
str

the gutenberg id

Source code in dhtk/data_sources/gutenberg/tools/author.py
def get_gutenberg_id(self):
    """
    Return the gutenberg id url stored for this author.

    Returns:
        str: the value kept under the "gutenberg_id" metadata key

    Raises:
        KeyError: if the metadata holds no "gutenberg_id" entry

    """
    author_uri = self.metadata["gutenberg_id"]
    return author_uri

book

Contains the GutenbergBook Class.

Returns a Book object which is extended to contain parameters for gutenberg id

GutenbergBook (Book)

Extends the Book class for Project Gutenberg books.

Examples:

>>> from pprint import pprint
>>> from  dhtk.data_sources.gutenberg.tools.book import GutenbergBook
>>> from  dhtk.data_sources.gutenberg.tools.author import GutenbergAuthor
>>> moby_dick = GutenbergBook(
>>>     title="Moby Dick",
>>>    gutenberg_id="http://www.gutenberg.org/ebooks/2489",
>>>    author=GutenbergAuthor(
>>>         gutenberg_id='http://www.gutenberg.org/2009/agents/9',
>>>         name='Melville, Herman'
>>>     )
>>> )
>>> from dhtk.data_sources.gutenberg.api.data import GutenbergData
>>> gutenberg_search = GutenbergData()
>>> book = gutenberg_search.book_from_book_id("http://www.gutenberg.org/ebooks/2701")
>>> book.print_info()
# Title       : Moby Dick; Or, The Whale
# Author      : Herman Melville
# Metadata    :
#     - gutenberg_id: http://www.gutenberg.org/ebooks/2701
Source code in dhtk/data_sources/gutenberg/tools/book.py
class GutenbergBook(Book):
    """Extends the Book class for Project Gutenberg books.


    Examples:
        >>> from pprint import pprint
        >>> from  dhtk.data_sources.gutenberg.tools.book import GutenbergBook
        >>> from  dhtk.data_sources.gutenberg.tools.author import GutenbergAuthor

        >>> moby_dick = GutenbergBook(
        ...     title="Moby Dick",
        ...     gutenberg_id="http://www.gutenberg.org/ebooks/2489",
        ...     author=GutenbergAuthor(
        ...         gutenberg_id='http://www.gutenberg.org/2009/agents/9',
        ...         name='Melville, Herman'
        ...     )
        ... )

        >>> from dhtk.data_sources.gutenberg.api.data import GutenbergData

        >>> gutenberg_search = GutenbergData()
        >>> book = gutenberg_search.book_from_book_id("http://www.gutenberg.org/ebooks/2701")
        >>> book.print_info()
        # Title       : Moby Dick; Or, The Whale
        # Author      : Herman Melville
        # Metadata    :
        #     - gutenberg_id: http://www.gutenberg.org/ebooks/2701


    """

    def __init__(self, gutenberg_id, title, author, same_as=None, **kwargs):
        """
        Init function of the GutenbergBook Class.

        Args:
            gutenberg_id (str): Must be "http://www.gutenberg.org/ebooks/" followed by digits.
            title (str): The title of the book, in format given by Gutenberg. Runs of
                whitespace are collapsed to single spaces.
            author (dhtk.common.author.Author): The object containing the author of the book.
                Of type dhtk.common.author.Author or a subclass of it.
            same_as (dict): A dictionary containing same_as URIs.
            **kwargs (dict): Will be used as metadata.

        Raises:
            ReferenceError: If gutenberg_id does not have the expected form.
        """
        if same_as is None:
            same_as = {}
        # Dots are escaped so that only the literal host name is accepted;
        # fullmatch() already anchors both ends of the pattern.
        id_format = re.compile(r"http://www\.gutenberg\.org/ebooks/\d+")
        if not id_format.fullmatch(gutenberg_id):
            raise ReferenceError("This gutenberg id is not valid! %s" % gutenberg_id)

        title = re.sub(r"\s+", " ", title)
        super().__init__(title=title, author=author, gutenberg_id=gutenberg_id, same_as=same_as, metadata=kwargs)

    def get_book_id(self):
        """Return the gutenberg id URI of the book.

        Returns:
            str: The "gutenberg_id" metadata entry, or "" when it is missing.
        """

        return self.metadata.get("gutenberg_id", "")

    def get_uri(self):
        """Return the URI of the book (the same value as get_book_id()).

        Returns:
            str: The "gutenberg_id" metadata entry, or "" when it is missing.
        """

        return self.metadata.get("gutenberg_id", "")

    def get_book_id_number(self):
        """Return the part of the gutenberg id that follows its last "/".

        Returns:
            str: E.g. "2701" for "http://www.gutenberg.org/ebooks/2701"; "" when the
            id is missing from the metadata.
        """
        return self.metadata.get("gutenberg_id", "/").rsplit("/", 1)[1]

    def get_text_file_dir_path(self):
        """Return the suffix of the uri of the book in a gutenberg text repository.

        Returns:
          str: The suffix of the gutenberg file repository where the file is to be found.

        Raises:
          ValueError: If the book id number is not an integer.

        Notes
        -----

        This method is generally used with::

            "file://gutenberg/repository/path/" + self.get_text_file_dir_path() + "-file.extension"
            #or
            "http://distant.gutenberg-repository.path" + self.get_text_file_dir_path() + "-file.extension"

        the "-file.extension" can be -0.txt, .zip, .txt depending on the presence in the repository
        and on the file encoding.

        Example
        -------

            print(book.get_text_file_dir_path())
            # "2/7/0/2701/2701"
        """
        gutenberg_id_num = self.get_book_id_number()
        # Parse once instead of once per branch.
        numeric_id = int(gutenberg_id_num)
        if numeric_id < 10:
            # Single-digit ids live under the "0" directory.
            subdir = "0/{0}/{0}".format(gutenberg_id_num)
        elif numeric_id < 100:
            subdir = "{0}/{1}/{1}".format(gutenberg_id_num[0], gutenberg_id_num)
        elif numeric_id < 1000:
            subdir = "{0}/{1}/{2}/{2}".format(
                gutenberg_id_num[0],
                gutenberg_id_num[1],
                gutenberg_id_num
            )
        else:
            # One directory level per digit, except for the last digit.
            gutenberg_id_string = str(gutenberg_id_num).zfill(2)
            subdir_part = "/".join(gutenberg_id_string[:-1])
            subdir = "{0}/{1}/{1}".format(subdir_part, gutenberg_id_num)
        return subdir

    def repository(self):
        """Return a new GutenbergTexts object built from this book."""
        repo = GutenbergTexts(self)

        return repo

    def original_text(self):
        """Return the result of get_original_text() of a fresh repository() object."""

        text = self.repository().get_original_text()

        return text

    def __eq__(self, other):
        """
        Equality function.

        Notes
        -----
        Test the equality of the two books. Using the gutenberg_id if other is an instance of
        GutenbergBook. If not, it uses the dhtk.common.author.Book.__eq__() method that uses the
        book's authors and titles.

        Parameters
        ----------
            other: an instance from dhtk.data_sources.templates.Book or any child class.

        Returns
        -------
            equality: bool
        """
        if isinstance(other, GutenbergBook):
            equals = self.get_book_id() == other.get_book_id()
        else:
            equals = super().__eq__(other)
        return equals

    def __hash__(self):
        """
        Returns hash of attributes of gutenberg book.

        Notes
        -----
        The hash is created from:
            - author
            - title
            - first date (from metadata)
        Allows dictionary keys to be compared quickly.

        Returns
        -------
            hash: int
        """
        return hash(self._author.get_full_name() + self._title + self.get_first_edition_date())

    def __repr__(self):
        """
        Return a short description: author's full name, title and book id number.

        Returns
        -------
        object_str : String representing the object
        """

        return "<GutenbergBook: %s - %s gutenberg_id: %s>" % (
            self.get_author().get_full_name(),
            self.get_title(),
            self.get_book_id_number()
        )
__eq__(self, other) special

Equality function.

Notes

Test the equality of the two books. Using the gutenberg_id if other is an instance of GutenbergBook. If not, it uses the dhtk.common.author.Book.eq() method that uses the book's authors and titles.

Parameters
other: an instance from dhtk.data_sources.templates.Book or any child class.
Returns
equality: bool
Source code in dhtk/data_sources/gutenberg/tools/book.py
def __eq__(self, other):
    """
    Tell whether ``other`` denotes the same book.

    Notes
    -----
    Two GutenbergBook objects are compared through their book ids. Any other object is
    delegated to the parent ``__eq__`` (documented as comparing the book's authors and
    titles).

    Parameters
    ----------
        other: an instance from dhtk.data_sources.templates.Book or any child class.

    Returns
    -------
        equality: bool
    """
    if not isinstance(other, GutenbergBook):
        return super().__eq__(other)
    return self.get_book_id() == other.get_book_id()
__hash__(self) special

Returns hash of attributes of gutenberg book.

Notes

The hash is created from: - author - title - first date (from metadata) Allows dictionary keys to be compared quickly.

Returns
hash: int
Source code in dhtk/data_sources/gutenberg/tools/book.py
def __hash__(self):
    """
    Compute the hash of the gutenberg book.

    Notes
    -----
    The hashed value concatenates, in this order:
        - the author's full name
        - the title
        - the first edition date
    Allows dictionary keys to be compared quickly.

    Returns
    -------
        hash: int
    """
    author_name = self._author.get_full_name()
    fingerprint = author_name + self._title + self.get_first_edition_date()
    return hash(fingerprint)
__init__(self, gutenberg_id, title, author, same_as=None, **kwargs) special

Init function of the GutenbergBook Class.

gutenberg_id (str) : Must start with "http://www.gutenberg.org/ebooks/".

author (dhtk.common.author.Author) :The object containing the author of the book. Of type dhtk.common.author.Author or a subclass of it.

title (str) :The title of the book, in format given by Gutenberg.

same_as (dict): A dictionary containing same_as URIs.

**kwargs (dict) : Will be used as metadata.

Source code in dhtk/data_sources/gutenberg/tools/book.py
def __init__(self, gutenberg_id, title, author, same_as=None, **kwargs):
    """
    Init function of the GutenbergBook Class.

    Args:
        gutenberg_id (str): Must be "http://www.gutenberg.org/ebooks/" followed by digits.
        title (str): The title of the book, in format given by Gutenberg. Runs of
            whitespace are collapsed to single spaces.
        author (dhtk.common.author.Author): The object containing the author of the book.
            Of type dhtk.common.author.Author or a subclass of it.
        same_as (dict): A dictionary containing same_as URIs.
        **kwargs (dict): Will be used as metadata.

    Raises:
        ReferenceError: If gutenberg_id does not have the expected form.
    """
    if same_as is None:
        same_as = {}
    # Dots are escaped so that only the literal host name is accepted;
    # fullmatch() already anchors both ends of the pattern.
    id_format = re.compile(r"http://www\.gutenberg\.org/ebooks/\d+")
    if not id_format.fullmatch(gutenberg_id):
        raise ReferenceError("This gutenberg id is not valid! %s" % gutenberg_id)

    title = re.sub(r"\s+", " ", title)
    super().__init__(title=title, author=author, gutenberg_id=gutenberg_id, same_as=same_as, metadata=kwargs)
__repr__(self) special
Returns

object_str : String representing the object

Source code in dhtk/data_sources/gutenberg/tools/book.py
def __repr__(self):
    """
    Describe the book by author's full name, title and book id number.

    Returns
    -------
    object_str : String representing the object
    """
    author_name = self.get_author().get_full_name()
    title = self.get_title()
    id_number = self.get_book_id_number()
    return "<GutenbergBook: %s - %s gutenberg_id: %s>" % (author_name, title, id_number)
get_book_id(self)
Source code in dhtk/data_sources/gutenberg/tools/book.py
def get_book_id(self):
    """Return the gutenberg id URI of the book.

    Returns:
        str: The "gutenberg_id" metadata entry, or "" when it is missing.
    """
    book_uri = self.metadata.get("gutenberg_id", "")
    return book_uri
get_book_id_number(self)
Source code in dhtk/data_sources/gutenberg/tools/book.py
def get_book_id_number(self):
    """Return the part of the gutenberg id that follows its last "/".

    Returns:
        str: E.g. "2701" for "http://www.gutenberg.org/ebooks/2701"; "" when the
        id is missing from the metadata.

    Raises:
        IndexError: If the stored id contains no "/" at all.
    """
    # The "/" fallback makes a missing id yield "".
    book_uri = self.metadata.get("gutenberg_id", "/")
    pieces = book_uri.rsplit("/", 1)
    return pieces[1]
get_text_file_dir_path(self)

Return the suffix of the uri of the book in a gutenberg text repository.

Returns:

Type Description
str

Returns the suffix of the gutenberg file repository where the file is to be found:

Notes

This method is generally used with::

"file://gutenberg/repository/path/" + self.get_text_file_dir_path() + "-file.extension"
#or
"http://distant.gutenberg-repository.path" + self.get_text_file_dir_path() + "-file.extension"

the "-file.extension" can be -0.txt, .zip, .txt depending on the presence in the repository and on the file encoding.

Example
print(book.get_text_file_dir_path())
# "2/7/0/2701/2701"
Source code in dhtk/data_sources/gutenberg/tools/book.py
def get_text_file_dir_path(self):
    """Return the suffix of the uri of the book in a gutenberg text repository.

    Returns:
      str: The suffix of the gutenberg file repository where the file is to be found.

    Notes
    -----

    This method is generally used with::

        "file://gutenberg/repository/path/" + self.get_text_file_dir_path() + "-file.extension"
        #or
        "http://distant.gutenberg-repository.path" + self.get_text_file_dir_path() + "-file.extension"

    the "-file.extension" can be -0.txt, .zip, .txt depending on the presence in the repository
    and on the file encoding.

    Example
    -------

        print(book.get_text_file_dir_path())
        # "2/7/0/2701/2701"
    """
    book_number = self.get_book_id_number()
    numeric_value = int(book_number)

    # The directory prefix depends on the magnitude of the id; the last two
    # path components are always the full id, repeated.
    if numeric_value < 10:
        prefix = "0"
    elif numeric_value < 100:
        prefix = book_number[0]
    elif numeric_value < 1000:
        prefix = f"{book_number[0]}/{book_number[1]}"
    else:
        # One directory level per digit, leaving out the last digit.
        prefix = "/".join(str(book_number)[:-1])

    return f"{prefix}/{book_number}/{book_number}"
get_uri(self)
Source code in dhtk/data_sources/gutenberg/tools/book.py
def get_uri(self):
    """Return the book's URI, i.e. its gutenberg id.

    Returns:
      The value stored under the "gutenberg_id" metadata key, or an empty
      string when that key is absent.
    """
    book_metadata = self.metadata
    return book_metadata.get("gutenberg_id", "")

texts

Contains GutenbergTexts Class.

Notes

This class is being reworked. The cleaning of the texts was adapted from: https://github.com/okfn/gutenizer

GutenbergTexts

Clean up Gutenberg texts by removing all the header and footer bumpf.

Notes

Parts of this class have to be reworked.

Usage : init and then run _extract_text. _notes_end = "" _header_end = "" _footer_start = "" _original_text = "" _clean_text = "" _url = ""

Source code in dhtk/data_sources/gutenberg/tools/texts.py
class GutenbergTexts:
    """
    Retrieve the original text of a Gutenberg book from a text repository.

    Notes
    -----
    Parts of this class have to be reworked. Removing the Gutenberg header
    and footer is not implemented here: only the original text can be
    fetched and saved (the clean-text save method is commented out below).

    Attributes set by ``__init__``: ``book``, ``_repository_uri``,
    ``_temporary_dir`` (scratch directory, removed by ``close``) and
    ``_original_text`` (cache filled by ``get_original_text``).
    """

    def __init__(self, book, repository_uri='http://aleph.gutenberg.org'):
        """
        Init function of the GutenbergTexts.

        Check repository_uri and create a temporary directory for file operations.
        Refer to:
        https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages
        to download the files.

        Parameters
        ------------------
        book :
            Book object. The methods of this class call its
            get_text_file_dir_path(), get_book_id_number() and
            get_text_file_name() methods and read its _title attribute.
        repository_uri : str
            Can be a file uri file://home/user/Documents/gutenberg_dump or
            a http uri: http://aleph.gutenberg.org

        Raises
        ------
        ValueError
            If repository_uri is empty or points to
            http://www.gutenberg.org/files.
        """
        self._original_text = None
        if not repository_uri:
            raise ValueError("Please set the URI of a 'local' gutenberg text repository.")

        if "http://www.gutenberg.org/files" in repository_uri:
            raise ValueError(
                """
                Please create a local repository. More information on:
                https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages
                """
            )

        self._temporary_dir = Path(mkdtemp(prefix="dhtk-"))
        self._repository_uri = repository_uri
        self.book = book

    def get_original_text(self):
        """Return the original text of the book, fetching it on first use.

        Candidate URLs are built from the repository URI, the book's text
        directory path and a list of extensions tried in order; the zip
        variants are only tried for "file://" repositories. The text is
        cached in ``self._original_text``.

        Returns:
          The text of the book, or None when none of the candidate URLs exists.
        """
        if self._original_text:
            return self._original_text

        base_url = self._repository_uri + "/" + self.book.get_text_file_dir_path()

        valid_extensions = ("-0.txt", "-8.txt", ".txt")
        if self._repository_uri.startswith("file://"):
            valid_extensions += ("-0.zip", "-8.zip", ".zip")

        found_url = False
        url = ""
        for extension in valid_extensions:
            url = base_url + extension
            try:
                found_url = url_exists(url)
            except Exception:  # aleph is not reliable, just use gutenberg directly for now
                # Build the fallback URL directly rather than substituting
                # into `url` with re.sub: the repository URI is not a regular
                # expression and may contain regex metacharacters.
                book_number = self.book.get_book_id_number()
                url = f"http://www.gutenberg.org/files/{book_number}/{book_number}{extension}"
                found_url = url_exists(url)

            if found_url:
                break

        # TODO: once search does not find audio editions anymore uncomment this:
        # if not found_url:
        #     raise Warning(
        #        "Could not find the text file for {} {}.".format(
        #           book.get_author(),
        #           book.get_title()
        #       )
        #    )
        # TODO: once search does not find audio anymore editions remove this:
        if not found_url:
            return None

        # NOTE(review): assumes download_files returns the path of the single
        # downloaded file as a str -- confirm against its implementation.
        raw_file_path = download_files(
            url,
            self._temporary_dir / self.book.get_text_file_name(),
            self.book._title
        )
        if raw_file_path.endswith(".zip"):
            self._original_text = unarchive_book(raw_file_path)
        else:
            with open(raw_file_path, "r", encoding="utf8", errors='ignore') as book_text_file:
                self._original_text = book_text_file.read()

        # The downloaded file is only a staging copy; remove it once read.
        pathlib.Path(raw_file_path).unlink()

        return self._original_text

    def save_original_text_file_to(self, destination):
        """Save the original text to a text-file in destination.

        An existing, non-empty file is left untouched. When no text could be
        retrieved, nothing is written.

        Args:
          destination(str): Path of the directory where the text will be saved.
            It is created if it does not exist.

        Returns:
          pathlib.Path: Path of the text file inside destination. The file may
          not exist if no text was available or if it could not be created.

        """
        destination = pathlib.Path(destination)
        filename = destination / self.book.get_text_file_name()
        # A non-empty file means the text was already saved: keep it.
        if filename.is_file() and filename.stat().st_size != 0:
            return filename

        text = self.get_original_text()
        if text is None:
            # get_original_text() returns None when no text file was found.
            print("No text could be retrieved for %s." % filename)
            return filename

        destination.mkdir(parents=True, exist_ok=True)

        try:
            with open(filename, "w", encoding='utf8') as file_writer:
                file_writer.write(text)
        except IOError:
            # LOGGER.warning("File %s could not be created.", filename)
            print("File %s could not be created." % filename)
        return filename

    # def save_clean_text_file_to(self, destination):
    #     """Save the clean text to a text-file in or at destination.
    #
    #     Args:
    #       destination(str): Path of the destination where the text will be saved.
    #
    #     Returns:
    #
    #     """
    #     self.get_original_text()
    #
    #     destination = Path(destination)
    #
    #     if not destination.is_dir():
    #         destination.mkdir(parents=True, exist_ok=True)
    #
    #     filename = self.book.get_text_file_name()
    #
    #     filename = destination / filename
    #     if not filename.is_file() or filename.stat().st_size == 0:
    #         with open(filename, "w") as file_writer:
    #             file_writer.write(self._clean_text)
    #
    #     return filename

    def __del__(self):
        """Remove the temporary directory when the instance is garbage collected."""
        self.close()

    def close(self):
        """Remove temporary directory if instance is closed."""
        try:
            if self._temporary_dir.is_dir():
                shutil.rmtree(self._temporary_dir)
        except (AttributeError, NameError):
            # AttributeError: __init__ raised before _temporary_dir was set,
            # so there is nothing to remove.
            pass
__init__(self, book, repository_uri='http://aleph.gutenberg.org') special

Init function of the GutenbergTexts.

Check repository_uri and create a temporary directory for file operations. repository_uri can be a local URI such as file:/path/to/dir; refer to https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages for how to download the files.

Parameters

repository_uri : str Can be a file uri file://home/user/Documents/gutenberg_dump or a http uri: http://aleph.gutenberg.org

Source code in dhtk/data_sources/gutenberg/tools/texts.py
def __init__(self, book, repository_uri='http://aleph.gutenberg.org'):
    """
    Set up a GutenbergTexts instance.

    Validate repository_uri and create a temporary directory for file
    operations. See
    https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages
    for how to download the files.

    Parameters
    ------------------
    book :
        Book object stored on the instance as ``self.book``.
    repository_uri : str
        Can be a file uri file://home/user/Documents/gutenberg_dump or
        a http uri: http://aleph.gutenberg.org

    Raises
    ------
    ValueError
        If repository_uri is empty or contains
        http://www.gutenberg.org/files.
    """
    self._original_text = None

    if not repository_uri:
        raise ValueError("Please set the URI of a 'local' gutenberg text repository.")

    points_to_gutenberg_files = "http://www.gutenberg.org/files" in repository_uri
    if points_to_gutenberg_files:
        raise ValueError(
            """
            Please create a local repository. More information on:
            https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages
            """
        )

    scratch_dir = mkdtemp(prefix="dhtk-")
    self._temporary_dir = Path(scratch_dir)
    self._repository_uri = repository_uri
    self.book = book
close(self)

Remove temporary directory if instance is closed.

Source code in dhtk/data_sources/gutenberg/tools/texts.py
def close(self):
    """Remove temporary directory if instance is closed.

    Does nothing when the directory is already gone or when the instance
    never got a temporary directory.
    """
    try:
        if self._temporary_dir.is_dir():
            shutil.rmtree(self._temporary_dir)
    except (AttributeError, NameError):
        # AttributeError: _temporary_dir was never assigned (for example
        # because __init__ raised before creating it), so there is nothing
        # to remove.
        pass
get_original_text(self)

Returns original text of a given book.

Source code in dhtk/data_sources/gutenberg/tools/texts.py
def get_original_text(self):
    """Return the original text of the book, fetching it on first use.

    Candidate URLs are built from the repository URI, the book's text
    directory path and a list of extensions tried in order; the zip variants
    are only tried for "file://" repositories. The text is cached in
    ``self._original_text``.

    Returns:
      The text of the book, or None when none of the candidate URLs exists.
    """
    if self._original_text:
        return self._original_text

    base_url = self._repository_uri + "/" + self.book.get_text_file_dir_path()

    valid_extensions = ("-0.txt", "-8.txt", ".txt")
    if self._repository_uri.startswith("file://"):
        valid_extensions += ("-0.zip", "-8.zip", ".zip")

    found_url = False
    url = ""
    for extension in valid_extensions:
        url = base_url + extension
        try:
            found_url = url_exists(url)
        except Exception:  # aleph is not reliable, just use gutenberg directly for now
            # Build the fallback URL directly rather than substituting into
            # `url` with re.sub: the repository URI is not a regular
            # expression and may contain regex metacharacters.
            book_number = self.book.get_book_id_number()
            url = f"http://www.gutenberg.org/files/{book_number}/{book_number}{extension}"
            found_url = url_exists(url)

        if found_url:
            break

    # TODO: once search does not find audio editions anymore uncomment this:
    # if not found_url:
    #     raise Warning(
    #        "Could not find the text file for {} {}.".format(
    #           book.get_author(),
    #           book.get_title()
    #       )
    #    )
    # TODO: once search does not find audio anymore editions remove this:
    if not found_url:
        return None

    # NOTE(review): assumes download_files returns the path of the single
    # downloaded file as a str -- confirm against its implementation.
    raw_file_path = download_files(
        url,
        self._temporary_dir / self.book.get_text_file_name(),
        self.book._title
    )
    if raw_file_path.endswith(".zip"):
        self._original_text = unarchive_book(raw_file_path)
    else:
        with open(raw_file_path, "r", encoding="utf8", errors='ignore') as book_text_file:
            self._original_text = book_text_file.read()

    # The downloaded file is only a staging copy; remove it once read.
    pathlib.Path(raw_file_path).unlink()

    return self._original_text
save_original_text_file_to(self, destination)

Save the original text to a text-file in or at destination.

Parameters:

Name Type Description Default
destination(str)

Path of the destination where the text will be saved.

required
Source code in dhtk/data_sources/gutenberg/tools/texts.py
def save_original_text_file_to(self, destination):
    """Save the original text to a text-file in destination.

    An existing, non-empty file is left untouched. When no text could be
    retrieved, nothing is written.

    Args:
      destination(str): Path of the directory where the text will be saved.
        It is created if it does not exist.

    Returns:
      pathlib.Path: Path of the text file inside destination. The file may
      not exist if no text was available or if it could not be created.

    """
    destination = pathlib.Path(destination)
    filename = destination / self.book.get_text_file_name()
    # A non-empty file means the text was already saved: keep it.
    if filename.is_file() and filename.stat().st_size != 0:
        return filename

    text = self.get_original_text()
    if text is None:
        # Nothing to write; avoid creating an empty file.
        print("No text could be retrieved for %s." % filename)
        return filename

    destination.mkdir(parents=True, exist_ok=True)

    try:
        with open(filename, "w", encoding='utf8') as file_writer:
            file_writer.write(text)
    except IOError:
        # LOGGER.warning("File %s could not be created.", filename)
        print("File %s could not be created." % filename)
    return filename
unarchive_book(path, destination=None)

Parameters:

Name Type Description Default
path(str)

Path of the archive of a book. A Zip file containing a single txt file.

required
destination(str, optional)

Path where the text file should be extracted. (Default value = None)

required
Source code in dhtk/data_sources/gutenberg/tools/texts.py
def unarchive_book(path, destination=None):
    """Read the text of a book from a zip archive.

    The first archive member whose name ends with ".txt" is read, its
    encoding is guessed with chardet and it is decoded to a string.

    Args:
      path(str): Path of the archive of a book. A Zip file containing a single txt file.
      destination(str, optional): Path of the file the decoded text should
        be written to. (Default value = None)

    Returns:
      str: The decoded text, or an empty string when the archive contains
      no ".txt" member.

    Raises:
      IOError: If destination is given but cannot be written.

    """
    raw_bytes = None
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(path, 'r') as archive:
        for member_name in archive.namelist():
            if member_name.endswith(".txt"):
                raw_bytes = archive.read(member_name)
                break

    if raw_bytes is None:
        # No text member: there is nothing to detect or decode.
        return ""

    detected = chardet.detect(raw_bytes)
    # chardet reports None when it cannot guess an encoding; fall back to UTF-8.
    raw_text = raw_bytes.decode(detected["encoding"] or "utf-8")

    if destination:
        try:
            with open(destination, "w", encoding="utf8") as out_file:
                out_file.write(raw_text)
        except IOError as err:
            # TODO: fix logger this
            raise IOError("%s could not be written." % destination) from err

    return raw_text