DHTK
dhtk.core.system
Utility functions used by all modules.
download_files(urls, path='./', file_names='')
Function to download files from the Web.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urls |
Union[str, List[str]] |
URLs of the files to be downloaded |
required |
path |
str |
Path to directory to store the downloaded files. (Default value = "./") |
'./' |
file_names |
Union[str, List[str]] |
Output name or names of the files to be downloaded. |
'' |
Source code in dhtk/core/system.py
def download_files(
        urls: typing.Union[str, typing.List[str]],
        path: str = "./",
        file_names: typing.Union[str, typing.List[str]] = "") -> \
        typing.Union[str, typing.List[str]]:
    """Function to download files from the Web.

    Args:
        urls (typing.Union[str, typing.List[str]]): URLs of the files to be downloaded
        path (str, optional): Path to directory to store the downloaded files.
            The directory (and its parents) is created when missing. (Default value = "./")
        file_names (typing.Union[str, typing.List[str]], optional): Output name or names of
            the files to be downloaded, one per URL. When empty, each name is derived from
            the last path segment of its URL with the query string removed.

    Returns:
        The supplied file names, or the names derived from the URLs that were downloaded;
        a single value instead of a list when there is exactly one name.
        If names were supplied and every one of them already exists in ``path``, nothing
        is downloaded and the existing ``pathlib.Path`` object(s) are returned instead.

    Raises:
        IndexError: if the number of file names differs from the number of URLs.
    """
    # Convert inputs to list
    if isinstance(urls, str):
        urls = [urls]
    path = pathlib.Path(path)
    path.mkdir(parents=True, exist_ok=True)

    derive_names = not file_names
    if derive_names:
        file_names = []
    else:
        if isinstance(file_names, str):
            file_names = [file_names]
        # Validate up front so a mismatch cannot surface half-way through the downloads
        if len(file_names) != len(urls):
            raise IndexError("The list of filenames should correspond to the list of urls.")

    chunk = 8192
    for index, url in enumerate(urls):
        if not url_exists(url):  # Warn if URL is not valid
            msg = f"URL not available: {url}"
            warnings.warn(msg)
            logger.warning("DOWNLOAD: %s", msg)
            continue

        if derive_names:
            file_name = url.split('/')[-1].split('?')[0]
        else:
            file_name = file_names[index]
            # NOTE(review): SOURCE indentation was lost; this early exit is applied only
            # when names were supplied, since an empty name list would always "exist".
            file_paths = [path / name for name in file_names]
            if all(file_path.exists() for file_path in file_paths):
                return file_paths[0] if len(file_paths) == 1 else file_paths

        # Make request
        headers = {'User-Agent': get_platform()}
        with requests.get(url, stream=True, headers=headers) as request:
            request.raise_for_status()
            # Content-Length is optional (e.g. chunked responses); tqdm accepts total=None
            content_length = request.headers.get('Content-Length')
            total = int(content_length) if content_length is not None else None
            # Read to file
            with open(path / file_name, 'wb') as out_file, \
                    tqdm.tqdm(total=total, desc=f"Downloading {file_name}") as progress_bar:
                for part in request.iter_content(chunk_size=chunk):
                    out_file.write(part)
                    # The last part is usually shorter than the chunk size
                    progress_bar.update(len(part))
        logger.info("DOWNLOAD: %s downloaded from %s to %s", file_name, url, path)
        if derive_names:
            # Record the derived name so the caller learns what was written
            file_names.append(file_name)

    if len(file_names) == 1:  # Return a string if there is only one file
        return file_names[0]
    return file_names
get_date(url)
Function to get last modified date of a remote file
Parameters:
Name | Type | Description | Default |
---|---|---|---|
url |
str |
the url link |
required |
Returns:
Type | Description |
---|---|
datetime (datetime) |
the datetime object |
Source code in dhtk/core/system.py
def get_date(url: str) -> datetime.datetime:
    """Return the last-modified date reported for a remote file.

    Args:
        url (str): the url link

    Returns:
        datetime (datetime): the ``last-modified`` response header of a HEAD request,
        parsed with the format ``'%a, %d %b %Y %H:%M:%S %Z'``.
    """
    response = requests.head(url, stream=True, headers={'User-Agent': get_platform()})
    stamp = response.headers['last-modified']
    return datetime.datetime.strptime(stamp, '%a, %d %b %Y %H:%M:%S %Z')
get_platform()
Returns a string describing the computer's platform (operating system and architecture).
Source code in dhtk/core/system.py
def get_platform():
    """Return a quoted description of the host: operating system name and architecture bits."""
    system_name = platform.system()
    bits = platform.architecture()[0]
    return f'"({system_name}; U; {bits}; en-us)"'
make_dirs(directories)
Function to create new directories at DHTK's initiation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
directories |
Union[str, List[str]] |
the new directory path |
required |
Source code in dhtk/core/system.py
def make_dirs(directories: typing.Union[str, typing.List[str]]) -> None:
    """Create every requested directory that does not exist yet.

    Args:
        directories (typing.Union[str, typing.List[str]]): a single directory path or a
            list of paths; missing parent directories are created too.

    The interpreter exits with an explanatory message when a directory cannot be created.
    """
    # Anything that is not a list is treated as one single path
    targets = directories if isinstance(directories, list) else [directories]
    for target in map(pathlib.Path, targets):
        if target.is_dir():
            continue
        try:
            target.mkdir(parents=True, exist_ok=True)
        except (IOError, PermissionError):
            sys.exit(f"Could not create {target}, please check user rights.")
pip_install(module_type, module)
Helper function to install missing modules
Parameters:
Name | Type | Description | Default |
---|---|---|---|
module_type |
str |
possible values "data_sources" or "storage" |
required |
module |
"the dhtk module name" |
required |
Returns:
Type | Description |
---|---|
the imported module. |
Source code in dhtk/core/system.py
def pip_install(module_type, module):
    """
    Helper function to install a missing dhtk module with pip and import it.

    Args:
        module_type (str): possible values "data_sources" or "storage"
        module: "the dhtk module name"

    Returns:
        the imported module.

    Raises:
        EnvironmentError: outside an interactive session, or when the user declines.
    """
    if 'dummy' in module:
        # Example ("dummy") modules live in an "examples" sub-group of the repository
        package = f"dhtk_{module_type.rstrip('s')}_{module}"
        import_path = f"dhtk.{module_type}.{module}"
        repo_url = f"git+ssh://git@gitlab.com/dhtk/dhtk_{module_type}/examples/{package}"
    else:
        package = f"dhtk_{module_type}_{module}"
        import_path = f"dhtk.{module_type.rstrip('s')}.{module}"
        repo_url = f"git+ssh://git@gitlab.com/dhtk/dhtk_{module_type}s/{package}"

    if not IS_INTERACTIVE:
        raise EnvironmentError(f"This method is for interactive usage only! Please install {package} manually:"
                               f"$ pip install {repo_url}")

    reply = input(f"Do you want dhtk to install {package}? [y/N]")
    accepted = reply.lower().startswith("y")
    if not accepted:
        raise EnvironmentError(f"Please install the module manually: $ pip install {repo_url}")

    pip_command = [sys.executable] + ['-m', 'pip', 'install', repo_url]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError:
        # A failed installation is reported; the import below is still attempted
        msg = f"Module {package} not available."
        warnings.warn(msg)
        logger.error("DATASET: " + msg)
    return importlib.import_module(import_path)
url_exists(url)
Function to check if an url is available.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
url |
str |
the path to check |
required |
Source code in dhtk/core/system.py
def url_exists(url: str):
    """Function to check if an url is available.

    Args:
        url (str): the path to check; either a ``file:`` URL or a remote URL.

    Returns:
        bool: for ``file:`` URLs, whether the path after the first slash exists on disk;
        otherwise whether a HEAD request succeeds with an OK status. Any request failure
        (connection error, timeout, malformed URL) yields ``False``.
    """
    # If the URL is a local file
    if url.startswith("file:"):
        _, slash, remainder = url.partition("/")
        if not slash:
            # "file:name" carries no path component at all
            return False
        return os.path.exists("/" + remainder)
    # If URL is a remote file
    try:
        response = requests.head(url)
    except requests.exceptions.RequestException:
        # Base class of ConnectionError, Timeout, MissingSchema, InvalidURL, ...
        return False
    return response.ok
dhtk.data_sources.gutenberg
special
Gutenberg extension data_source
Module (AbstractDataSource)
Gutenberg Triplestore Class
Source code in dhtk/data_sources/gutenberg/__init__.py
class Module(AbstractDataSource):
    """Gutenberg Triplestore Class

    Queries go through a ``GutenbergData`` wrapper and selected books are collected
    in a ``GutenbergCorpus``.
    """
    name = "gutenberg"
    storage_type = "triplestore"
    data_file = "https://sandbox.zenodo.org/record/967894/files/gutenberg-dhtk-reasoned.ttl?download=1"

    @classmethod
    def get_data_file(cls, output_path, storage_type):
        """
        Get a content as defined in cls.data_file and write into a file into output_path

        Args:
            output_path (Path): the path where to write (i.e. output_path = Path('WD/gutenberg/data/triplestore'))
            storage_type (str): the type of the storage

        Returns:
            Path: the path to the file with data
        """
        if isinstance(cls.storage_type, str):
            data_file = cls.data_file
        else:
            # Assumes data_file is a sequence parallel to storage_type - TODO confirm.
            # The original indexed storage_type itself, which yields the storage name
            # rather than a URL to download.
            data_file = cls.data_file[cls.storage_type.index(storage_type)]
        download_files(data_file, output_path, "gutenberg.ttl")
        # Coerce so that a plain string path is accepted as well
        return pathlib.Path(output_path) / "gutenberg.ttl"

    def __init__(self, working_directory, endpoints):
        """Create the query wrapper on the first endpoint and an empty corpus.

        Args:
            working_directory: directory handed to the corpus for its files.
            endpoints: sequence whose first item is used as the SPARQL endpoint.
        """
        # Get the extension
        self.wrapper = GutenbergData(sparql_endpoint=endpoints[0])
        # Instantiate a Corpus
        self._corpus = GutenbergCorpus(
            name=f"{self.name.capitalize()} Corpus",
            working_directory=working_directory,
            description="Gutenberg Books"
        )

    def welcome(self):
        """Query the wrapper's statistics, retrying up to 10 times with a 10 s pause.

        After the last failed attempt a warning is issued and a hint is printed.
        """
        # NOTE(review): ``stats`` is computed but neither returned nor printed in the
        # visible code - confirm whether the caller was meant to receive it.
        stats = None
        for check in range(10):
            try:
                stats = self.wrapper.statistics()
                break
            except (RemoteDisconnected, URLError, ConnectionResetError):
                if check >= 9:
                    warnings.warn("WARNING: There is a problem with the connection!")
                    print("Probably Docker is slow to restart!")
                    stats = "\nNo statistics available"
                    break
                sleep(10)

    def get(self, what, name="all", add=False):
        """
        Extension wrapper method to call all DHTK functionalities with a simple syntax

        Parameters
        ----------
        what: string
            Type of information to retrieve.
            DHTK Gutenberg has the options to search for books, authors, shelves and subjects
            (matched on the first two letters, case-insensitive).
        name: string [default: "all"]
            Name identifying the specific information to retrieve.
            If all, retrieve all information available
        add: boolean [default: False]
            Add query results to Corpus

        Returns
        -------
        dict
            Books keyed by short title, authors keyed by full name, or lists of
            "title [author]" strings keyed by bookshelf / subject.
            An empty dict when ``what`` is not a valid option.
        """
        # Prepare arguments
        name = name.strip().lower()
        what = what.strip().lower()

        if what.startswith("bo"):
            if name == "all":
                response = self.wrapper.all_books()
            else:
                response = self.wrapper.search_by_title(name)
            response = [self.wrapper.get_book(book["book_id"]) for book in response]
            if add:
                self._corpus.add_books(response)
            # Use a short book title
            response = {f"{book.get_title()[:20]} ({book.get_book_id_number()})": book
                        for book in response}

        elif what.startswith("au"):
            if name == "all":
                response = self.wrapper.all_authors()
            else:
                response = self.wrapper.search_by_author(name)
            ids = {author["author_id"] for author in response}
            response = [self.wrapper.get_author(author_id) for author_id in ids]
            if add:
                for author in response:
                    for title in author.get_bibliography():
                        # Match short book key name
                        sub_title = re.sub(r"\s+", " ", title[:20])
                        found = self.get(what="book", name=title)
                        books = [book for key, book in found.items()
                                 if book.get_author() == author and sub_title in key]
                        self._corpus.add_books(books)
            response = {f"{author.get_full_name()}": author for author in response}

        elif what.startswith("sh"):
            if name == "all":
                response = self.wrapper.all_bookshelves()
            else:
                response = self.wrapper.search_by_bookshelf(name)
            if add:
                self._corpus.add_books(
                    [self.wrapper.get_book(book["book_id"]) for book in response])
            results = {}
            for shelf in response:
                entry = f"{shelf['title']} [{shelf['author']}]"
                titles = results.setdefault(shelf['bookshelf'], [])
                if entry not in titles:
                    titles.append(entry)
            response = results

        elif what.startswith("su"):
            if name == "all":
                response = self.wrapper.all_subjects()
            else:
                response = self.wrapper.search_by_subject(name)
            if add:
                self._corpus.add_books(
                    [self.wrapper.get_book(book["book_id"]) for book in response])
            results = {}
            for subject in response:
                entry = f"{subject['title']} [{subject['author']}]"
                titles = results.setdefault(subject['subject'], [])
                if entry not in titles:
                    titles.append(entry)
            response = results

        else:
            warnings.warn("Not a valid option")
            print("Allowed arguments are:\nbook\nauthor\nshelf\nsubject")
            # Nothing was queried; falling through would read an unbound ``response``
            return {}

        if not response:
            warnings.warn(f"No {what} found.\n"
                          "Please make sure there are no spelling mistakes.\n"
                          "You may search all available options by ignoring the [name] argument")
        return response

    def corpus(self, name=None, description=None):
        """Optionally rename / re-describe the corpus, then return it."""
        if name is not None:
            self._corpus.name(name)
        if description is not None:
            self._corpus.description(description)
        return self._corpus

    def save(self, close=True):
        """Write the corpus texts, its metadata tables and a pickled DataFrame to disk.

        Args:
            close (bool): when true, ``self.close()`` is called afterwards
                (not defined in this class; presumably inherited - verify).
        """
        # Prepare save directory
        path = pathlib.Path(self._corpus.path())
        if not path.is_dir():
            make_dirs(path)
        # Save Gutenberg Texts
        self._corpus.download_corpus()
        # Save metadata table
        self._corpus.download_metadata()
        # Save Python Query Object; the ``with`` block closes the file on exit
        with open(path / "corpus.pk", 'wb') as pickled_file:
            pickle.dump(self._corpus.to_pandas_dataframe(),
                        pickled_file, protocol=pickle.HIGHEST_PROTOCOL)
        if close:
            self.close()
get(self, what, name='all', add=False)
Extension wrapper method to call all DHTK functionalities with a simple syntax
Parameters
!!! what "string" Type of information to retrieve. DHTK Gutenberg has the options to search for books, authors, shelves and subjects !!! name "string [default: "all"]" Name identifying the specific information to retrieve. If all, retrieve all information available !!! add "boolean [default: False]" Add query results to Corpus
Returns
Requested book information from Gutenberg dataset
Source code in dhtk/data_sources/gutenberg/__init__.py
def get(self, what, name="all", add=False):
    """
    Extension wrapper method to call all DHTK functionalities with a simple syntax

    Parameters
    ----------
    what: string
        Type of information to retrieve.
        DHTK Gutenberg has the options to search for books, authors, shelves and subjects
        (matched on the first two letters, case-insensitive).
    name: string [default: "all"]
        Name identifying the specific information to retrieve.
        If all, retrieve all information available
    add: boolean [default: False]
        Add query results to Corpus

    Returns
    -------
    dict
        Books keyed by short title, authors keyed by full name, or lists of
        "title [author]" strings keyed by bookshelf / subject.
        An empty dict when ``what`` is not a valid option.
    """
    # Prepare arguments
    name = name.strip().lower()
    what = what.strip().lower()

    if what.startswith("bo"):
        if name == "all":
            response = self.wrapper.all_books()
        else:
            response = self.wrapper.search_by_title(name)
        response = [self.wrapper.get_book(book["book_id"]) for book in response]
        if add:
            self._corpus.add_books(response)
        # Use a short book title
        response = {f"{book.get_title()[:20]} ({book.get_book_id_number()})": book
                    for book in response}

    elif what.startswith("au"):
        if name == "all":
            response = self.wrapper.all_authors()
        else:
            response = self.wrapper.search_by_author(name)
        ids = {author["author_id"] for author in response}
        response = [self.wrapper.get_author(author_id) for author_id in ids]
        if add:
            for author in response:
                for title in author.get_bibliography():
                    # Match short book key name
                    sub_title = re.sub(r"\s+", " ", title[:20])
                    found = self.get(what="book", name=title)
                    books = [book for key, book in found.items()
                             if book.get_author() == author and sub_title in key]
                    self._corpus.add_books(books)
        response = {f"{author.get_full_name()}": author for author in response}

    elif what.startswith("sh"):
        if name == "all":
            response = self.wrapper.all_bookshelves()
        else:
            response = self.wrapper.search_by_bookshelf(name)
        if add:
            self._corpus.add_books(
                [self.wrapper.get_book(book["book_id"]) for book in response])
        results = {}
        for shelf in response:
            entry = f"{shelf['title']} [{shelf['author']}]"
            titles = results.setdefault(shelf['bookshelf'], [])
            if entry not in titles:
                titles.append(entry)
        response = results

    elif what.startswith("su"):
        if name == "all":
            response = self.wrapper.all_subjects()
        else:
            response = self.wrapper.search_by_subject(name)
        if add:
            self._corpus.add_books(
                [self.wrapper.get_book(book["book_id"]) for book in response])
        results = {}
        for subject in response:
            entry = f"{subject['title']} [{subject['author']}]"
            titles = results.setdefault(subject['subject'], [])
            if entry not in titles:
                titles.append(entry)
        response = results

    else:
        warnings.warn("Not a valid option")
        print("Allowed arguments are:\nbook\nauthor\nshelf\nsubject")
        # Nothing was queried; falling through would read an unbound ``response``
        return {}

    if not response:
        warnings.warn(f"No {what} found.\n"
                      "Please make sure there are no spelling mistakes.\n"
                      "You may search all available options by ignoring the [name] argument")
    return response
get_data_file(output_path, storage_type)
classmethod
Get a content as defined in self.data_file and write into a file into output_path
Parameters:
Name | Type | Description | Default |
---|---|---|---|
output_path |
Path |
the path where to write (i.e. output_path = Path('WD/gutenberg/data/triplestore')) |
required |
storage_type |
str |
the type of the storage |
required |
Returns:
Type | Description |
---|---|
Path |
the path to the file with data |
Source code in dhtk/data_sources/gutenberg/__init__.py
@classmethod
def get_data_file(cls, output_path, storage_type):
    """
    Get a content as defined in cls.data_file and write into a file into output_path

    Args:
        output_path (Path): the path where to write (i.e. output_path = Path('WD/gutenberg/data/triplestore'))
        storage_type (str): the type of the storage

    Returns:
        Path: the path to the file with data
    """
    if isinstance(cls.storage_type, str):
        data_file = cls.data_file
    else:
        # Assumes data_file is a sequence parallel to storage_type - TODO confirm.
        # The original indexed storage_type itself, which yields the storage name
        # rather than a URL to download.
        data_file = cls.data_file[cls.storage_type.index(storage_type)]
    download_files(data_file, output_path, "gutenberg.ttl")
    # Coerce so that a plain string path is accepted as well
    return pathlib.Path(output_path) / "gutenberg.ttl"
api
special
corpus
GutenbergCorpus (Corpus)
Class to create a corpus from books of type dhtk.common.Book
.
!!! notes The corpus can be created from a list of books. The list of books can be obtained by applying different filters, such as searching for books by:
- subject :class:`Data.search_by_subject()`,
- author :class:`Data.search_by_author()`,
- title :class:`Data.search_by_title()`,
- bookshelves :class:`Data.search_by_bookshelves()`.
Corpus has a name, description, a path to save and a list of selective books.
A single book or even a list of books can be added to the corpus created.
It is also possible to remove a book or all the books from corpus.
Download a single book or the whole books of the corpus in a local machine
can be also done by this class.
Examples:
import os >>> from pprint import pprint >>> from dhtk.data_sources.templates.corpus import Corpus >>> from dhtk.data_sources.gutenberg.api.data import GutenbergData # Initialise class GutenbergData as gutenberg_search. >>> gutenberg_search = GutenbergData() # Data the books by bookshelf and store the id of books in the list >>> books_found = gutenberg_search.search_by_author("Jane","Austen") >>> book_ids = [item['book_id'] for item in books_found] >>> books = set() # Get only the 4 first books found. >>> for book_id in book_ids[0:4]: >>> books.add(gutenberg_search.book_from_book_id(book_id)) # Create the corpus. >>> corpus = Corpus( >>> "jane_austen", >>> description="Books by Jane Austen", >>> corpora_path=os.path.expanduser("~/Desktop/"), >>> book_list=books >>> ) >>> corpus.print_book_list() # 0 Jane Austen Emma # 1 Jane Austen Gevoel en verstand # 2 Jane Austen Emma # 3 Jane Austen Lady Susan
Source code in dhtk/data_sources/gutenberg/api/corpus.py
class GutenbergCorpus(Corpus):
    """Class to create a corpus from books of type `dhtk.common.Book`.

    Notes:
        The corpus can be created from a list of books. The list of books can be obtained
        by applying different filters, such as searching for books by:

        - subject :class:`Data.search_by_subject()`,
        - author :class:`Data.search_by_author()`,
        - title :class:`Data.search_by_title()`,
        - bookshelves :class:`Data.search_by_bookshelves()`.

        Corpus has a name, description, a path to save and a list of selective books.
        A single book or even a list of books can be added to the corpus created.
        It is also possible to remove a book or all the books from corpus.
        Downloading a single book or all the books of the corpus to the local machine
        can also be done by this class.

    Example:
        >>> import os
        >>> from pprint import pprint
        >>> from dhtk.data_sources.templates.corpus import Corpus
        >>> from dhtk.data_sources.gutenberg.api.data import GutenbergData
        # Initialise class GutenbergData as gutenberg_search.
        >>> gutenberg_search = GutenbergData()
        # Data the books by bookshelf and store the id of books in the list
        >>> books_found = gutenberg_search.search_by_author("Jane","Austen")
        >>> book_ids = [item['book_id'] for item in books_found]
        >>> books = set()
        # Get only the 4 first books found.
        >>> for book_id in book_ids[0:4]:
        >>>     books.add(gutenberg_search.book_from_book_id(book_id))
        # Create the corpus.
        >>> corpus = Corpus(
        >>>     "jane_austen",
        >>>     description="Books by Jane Austen",
        >>>     corpora_path=os.path.expanduser("~/Desktop/"),
        >>>     book_list=books
        >>> )
        >>> corpus.print_book_list()
        # 0 Jane Austen Emma
        # 1 Jane Austen Gevoel en verstand
        # 2 Jane Austen Emma
        # 3 Jane Austen Lady Susan
    """
    # TODO: convertors: metadata -> sqllite | texts -" directory

    def __init__(self, name, working_directory, description="", corpora_path=None, book_list=None):
        """
        Initiate a corpus class

        Args:
            name (str): Name of the corpus.
            working_directory (str): The working directory path
            description (str, optional): A description of the corpus. Default to "".
            corpora_path (str, optional): Path where the texts of the books in the corpus are
                saved. Default to None, meaning ``<working_directory>/requests``.
            book_list (list[dhtk.common.book.Book], optional): A list of common.book.Book
                and/or child tools of it. Falsy entries are dropped. Default to None.
        """
        self._name = name
        self._description = description
        self._book_list = [book for book in book_list if book] if book_list else []
        # Store the location in the private attribute: assigning to ``self.path`` would
        # shadow the ``path()`` method and break every later ``self.path()`` call.
        if corpora_path:
            self._corpora_path = pathlib.Path(corpora_path)
        else:
            self._corpora_path = pathlib.Path(working_directory) / "requests"

    # Settings
    def path(self, path=None):
        """Sets and returns the path containing the text files of the books in the corpus.

        Args:
            path (str, optional): the path dir. Default to None

        Returns:
            pathlib.Path: the corpora directory joined with the corpus name, whose
            whitespace-separated words are joined by underscores.

        Examples:
            >>> corpus.path()
            # '~Desktop/jane_austen'
        """
        if path is not None:
            # Coerce so that the ``/`` join below also works for plain strings
            self._corpora_path = pathlib.Path(path)
        return self._corpora_path / "_".join(self._name.split())

    def name(self, name=None):
        """Sets and returns the name of the corpus.

        Args:
            name (str, optional): The corpus file will be saved in the local machine with
                this name. (Default value = None)

        Returns:
            str: the name of corpus.

        Examples:
            >>> corpus.name()
            # 'jane_austen'
        """
        if name is not None:
            self._name = name
        return self._name

    def description(self, description=None):
        """Sets and returns the description of the corpus.

        Args:
            description (str, optional): The description of the corpus (Default value = None)

        Returns:
            str: The description of the corpus

        Examples:
            >>> corpus.description()
            # 'Books by Jane Austen'
        """
        if description is not None:
            self._description = description
        return self._description

    # Books
    def books(self, get=False, remove=False):
        """Print, select or remove books of the corpus.

        Args:
            get: a full title, a short key ``"<first 20 chars of title> (<book id>)"`` or
                ``"all"``. When falsy, the corpus is printed as ``index author title``
                lines and nothing is returned. (Default value = False)
            remove: when true, the selected books are removed from the corpus.
                (Default value = False)

        Returns:
            A dict of the selected books keyed by short key; the single book itself when
            exactly one book matches and ``remove`` is false; ``None`` when ``get`` is falsy.
        """
        if not get:
            for index, book in enumerate(self._book_list):
                author_full_name = book.get_author().get_full_name()
                print("{} {} {}".format(index, author_full_name, book.get_title()))
            return None

        selected = {}
        for book in self._book_list:
            if not book:
                continue
            short_key = f"{book.get_title()[:20]} ({book.get_book_id_number()})"
            if get == book.get_title() or get == short_key or get == "all":
                selected[short_key] = book

        if remove:
            for book in selected.values():
                self.remove_book(book)
        elif len(selected) == 1:
            selected = next(iter(selected.values()))
        # NOTE(review): SOURCE indentation was lost; confirm that the removed books are
        # meant to be returned as well.
        return selected

    def add_book(self, book):
        """Add a single book to the corpus.

        Falsy values and books already in the corpus are ignored.

        Args:
            book (dhtk.common.book.Book): The book instance.

        Examples:
            Get a single book by gutenberg id.

            >>> len(corpus)
            # 4
            >>> book = gutenberg_search.book_from_book_id(book_ids[5])
            >>> corpus.add_book(book)
            >>> len(corpus)
            # 5
        """
        if book and book not in self._book_list:
            self._book_list.append(book)

    def add_books(self, book_list):
        """Add a list of books to the corpus.

        Args:
            book_list (list[dhtk.common.book.Book]): A list of books instances

        Examples:
            >>> len(corpus)
            # 5
            >>> books = [gutenberg_search.book_from_book_id(book_id) for book_id in book_ids[6:10]]
            >>> corpus.add_books(books)
            >>> len(corpus)
            # 9
        """
        for book in book_list:
            self.add_book(book)

    @staticmethod
    def get_book_file_name(book):
        """Return the text file name of a book, as given by ``book.get_text_file_name()``.

        Args:
            book (dhtk.common.book.Book): the book whose file name is requested.
        """
        return book.get_text_file_name()

    def remove_book(self, book):
        """Delete a book from the corpus: drop it from the book list and delete its text
        file from the corpus directory when that file exists.

        Args:
            book (dhtk.common.book.Book): The book to be removed

        Raises:
            ValueError: if the book is not in the corpus.
        """
        self._book_list.remove(book)
        file_path = self.path() / self.get_book_file_name(book)
        if file_path.is_file():
            file_path.unlink()

    def clear(self):
        """Delete all files and books in the corpus."""
        folder = pathlib.Path(self.path())
        if folder.is_dir():
            for content in folder.iterdir():
                content.unlink()
            folder.rmdir()
        self._book_list.clear()

    def download_book(self, book):
        """Download the text file for a single book and save it into the corpus repository

        Nothing is downloaded when the file is already present.

        Args:
            book (dhtk.common.book.Book): The book whose text you need to download
        """
        corpus_path = self.path()
        if not corpus_path.is_dir():
            make_dirs(corpus_path)
        path = corpus_path / book.get_text_file_name()
        if not path.is_file():
            book.repository().save_clean_text_file_to(corpus_path)
            # Pause after each download. NOTE(review): SOURCE indentation was lost;
            # confirm the pause belongs to the download branch only.
            sleep(5)

    def download_corpus(self):
        """Download the text files for the full corpus to the corpus path directory.

        Examples:
            >>> corpus.download_corpus()
            >>> pprint(os.listdir(corpus.path()))
            # ['158-gutenberg.txt',
            #  '19839-gutenberg.txt',
            #  '121-gutenberg.txt',
            #  '22954-gutenberg.txt',
            #  '1212-gutenberg.txt',
            #  '25946-gutenberg.txt',
            #  '22962-gutenberg.txt',
            #  '22953-gutenberg.txt']
        """
        for book in self._book_list:
            self.download_book(book)

    def download_metadata(self, filename="books_metadata.csv"):
        """Download the metadata for the full corpus into a csv file

        Book metadata goes to ``filename`` and author metadata to
        ``authors_metadata.csv``, both inside the corpus directory.

        Args:
            filename (str): the filename for the CSV file. Default to "books_metadata.csv"
        """
        # Imported locally, as in to_pandas_dataframe, so ``pd`` is always bound here
        import pandas as pd

        corpus_path = self.path()
        if not corpus_path.is_dir():
            make_dirs(corpus_path)
        # Save book metadata
        pandas_table = self.to_pandas_dataframe()
        pandas_table.to_csv(path_or_buf=corpus_path / filename, index=False)
        # Save author metadata
        authors = {book.get_author() for book in self._book_list}
        authors_meta = {author.get_full_name(): author.to_dict() for author in authors}
        panda_author = pd.DataFrame.from_dict(authors_meta, orient="index")
        # Move "name" to the first column; an empty corpus has no such column
        if "name" in panda_author.columns:
            names = panda_author.pop("name")
            panda_author.insert(0, 'name', names)
        panda_author.to_csv(path_or_buf=corpus_path / "authors_metadata.csv", index=False)

    def _book_record(self, book):
        """Return ``book.to_dict()`` plus ``text_file_path`` when the text file exists."""
        record = book.to_dict()
        file_path = self.path() / book.get_text_file_name()
        if file_path.is_file():
            record["text_file_path"] = file_path
        return record

    def to_dict(self):
        """Convert to python dict for general purpose (book records keyed by index)."""
        return {index: self._book_record(book)
                for index, book in enumerate(self._book_list)}

    def to_pandas_dataframe(self):
        """Convert the list of books into a pandas.DataFrame."""
        import pandas as pd
        book_list = [self._book_record(book) for book in self._book_list]
        return pd.DataFrame.from_dict(book_list)

    def __iter__(self):
        """
        Add capability to iterate over books in corpus.

        Returns:
            iterator (iter) : An iterator over the books in the corpus's book list.
        """
        yield from self._book_list

    def __len__(self):
        """
        List length.

        Returns:
            len (int): The number of books in the corpus.
        """
        return len(self._book_list)

    def __repr__(self):
        """
        Convert book_list in string format.

        Returns:
            str : the corpus name, description and path followed by one line per book
            with its index, the author's name and the title of the book.
        """
        # Add padding to the authors' name in order to have a clean string;
        # ``default`` keeps an empty corpus printable.
        max_author_name_len = max(
            (len(book.get_author().get_full_name()) for book in self._book_list),
            default=0,
        ) + 4
        format_string = "{}\t{:" + str(max_author_name_len) + "}\t{}"
        text = [f"Corpus name:\n\t{self.name()}",
                f"Corpus description:\n\t{self.description()}",
                f"Corpus path:\n\t{self.path()}",
                "Corpus books:"]
        text += [format_string.format(
            i, book.get_author().get_full_name(), book.get_title()
        ) for i, book in enumerate(self._book_list)]
        return "\n".join(text)

    def __getitem__(self, item):
        """
        Return the item requested.

        Args:
            item (int): The index of the book in the corpus

        Returns:
            The book stored at that index.
        """
        return self._book_list[item]
__getitem__(self, item)
special
Return the item requested.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
item |
int |
The index of the book in the corpus |
required |
Returns:
Type | Description |
---|---|
str |
The book requested |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __getitem__(self, item):
    """
    Look up a book of the corpus by position.

    Args:
        item (int): The index of the book in the corpus

    Returns:
        The entry of the internal book list stored at ``item``.
    """
    books = self._book_list
    return books[item]
__init__(self, name, working_directory, description='', corpora_path=None, book_list=None)
special
Initiate a corpus class
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name |
str |
Name of the corpus. |
required |
working_directory |
str |
The working directory path |
required |
description |
str,optional |
A description of the corpus. Default to "". |
'' |
corpora_path |
str,optional |
Path where the texts of the books in the corpus are saved. Default to None: |
None |
book_list |
list[dhtk.common.book.Book] |
A list of common.book.Book and/or child tools of it.Default to None: |
None |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __init__(self, name, working_directory, description="", corpora_path=None, book_list=None):
    """
    Initiate a corpus class

    Args:
        name (str): Name of the corpus.
        working_directory (str): The working directory path
        description (str, optional): A description of the corpus. Default to "".
        corpora_path (str, optional): Path where the texts of the books in the corpus are
            saved. Default to None, meaning ``<working_directory>/requests``.
        book_list (list[dhtk.common.book.Book], optional): A list of common.book.Book
            and/or child tools of it. Falsy entries are dropped. Default to None.
    """
    self._name = name
    self._description = description
    self._book_list = [book for book in book_list if book] if book_list else []
    # Store the location in the private attribute: assigning to ``self.path`` would
    # shadow the ``path()`` method and break every later ``self.path()`` call.
    if corpora_path:
        self._corpora_path = pathlib.Path(corpora_path)
    else:
        self._corpora_path = pathlib.Path(working_directory) / "requests"
__iter__(self)
special
Add capability to iterate over books in corpus.
Returns:
Type | Description |
---|---|
iterator (iter) |
An iterator over the books in the corpus's book list. |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __iter__(self):
    """
    Make the corpus iterable.

    Returns:
        iterator (iter) : a generator yielding the books of the corpus in list order.
    """
    yield from self._book_list
__len__(self)
special
List length.
Returns:
Type | Description |
---|---|
len (int) |
The number of books in the corpus. |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __len__(self):
    """
    Size of the corpus.

    Returns:
        len (int): how many books the internal book list currently holds.
    """
    book_count = len(self._book_list)
    return book_count
__repr__(self)
special
Convert book_list in string format.
Returns:
Type | Description |
---|---|
str |
A string of books in the list with information like the number of the book in the list, the author's name and the title of the book. |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def __repr__(self):
    """Render the corpus as text.

    Returns:
        str: The corpus name, description and path, followed by one line
            per book with its index in the list, the author's full name
            (padded to a common width) and the book title.
    """
    author_names = [book.get_author().get_full_name() for book in self._book_list]
    # Pad the author column to the longest name plus four characters.
    # ``default=0`` keeps an empty corpus printable: ``max`` of an empty
    # sequence would otherwise raise ValueError.
    column_width = max((len(author_name) for author_name in author_names), default=0) + 4
    lines = [
        f"Corpus name:\n\t{self.name()}",
        f"Corpus description:\n\t{self.description()}",
        f"Corpus path:\n\t{self.path()}",
        "Corpus books:",
    ]
    lines.extend(
        f"{index}\t{author_name:{column_width}}\t{book.get_title()}"
        for index, (author_name, book) in enumerate(zip(author_names, self._book_list))
    )
    return "\n".join(lines)
add_book(self, book)
Add a single book to the corpus.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
book(dhtk.common.book.Book) |
The book instance. |
required |
Examples:
Get a single book by gutenberg id. >>> len(corpus) # 4 >>> book = gutenberg_search.book_from_book_id(book_ids[5]) >>> corpus.add_book(book) >>> len(corpus) # 5
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def add_book(self, book):
    """Add a single book to the corpus.

    Falsy values and books that are already part of the corpus are ignored
    silently.

    Args:
        book (dhtk.common.book.Book): The book instance.

    Examples:
        Get a single book by gutenberg id.

        >>> len(corpus)
        # 4
        >>> book = gutenberg_search.book_from_book_id(book_ids[5])
        >>> corpus.add_book(book)
        >>> len(corpus)
        # 5
    """
    # Nothing to do for empty values or for books the corpus already holds.
    if not book or book in self._book_list:
        return
    self._book_list.append(book)
add_books(self, book_list)
Add a list of books to the corpus.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
book_list(list |
list: list[dhtk.common.book.Book]): A list of books instances |
required |
Examples:
>>> len(corpus)
# 5
>>> books = [gutenberg_search.book_from_book_id(book_id) for book_id in book_ids[6:10]]
>>> corpus.add_books(books)
>>> len(corpus)
# 9
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def add_books(self, book_list):
    """Add several books to the corpus, one ``add_book`` call per entry.

    Args:
        book_list (list[dhtk.common.book.Book]): A list of book instances.

    Examples:
        >>> len(corpus)
        # 5
        >>> books = [gutenberg_search.book_from_book_id(book_id) for book_id in book_ids[6:10]]
        >>> corpus.add_books(books)
        >>> len(corpus)
        # 9
    """
    for candidate in book_list:
        self.add_book(candidate)
books(self, get=False, remove=False)
Returns the list of books in the corpus. Print list of books in the corpus. This list contains the number of books in corpus, authors' full name and the books title.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
get |
Default value = False) |
False |
|
remove |
Default value = False) |
False |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def books(self, get=False, remove=False):
    """Print, look up or remove books of the corpus.

    Without ``get`` every book is printed as "<index> <author full name>
    <title>". With ``get`` the books are selected by full title, by short
    title ("<first 20 characters of the title> (<book id number>)") or all
    at once with "all".

    Args:
        get (str or bool): Selector for the books to fetch: a full title, a
            short title or "all". (Default value = False)
        remove (bool): When true, the selected books are removed from the
            corpus instead of being returned. (Default value = False)

    Returns:
        A dict mapping short titles to the selected books, or the book
        itself when exactly one book is selected. None when the books are
        printed or removed.
    """
    # TODO: fix the docstring
    if not get:
        for position, entry in enumerate(self._book_list):
            full_name = entry.get_author().get_full_name()
            print(f"{position} {full_name} {entry.get_title()}")
        return None

    selected = {}
    for entry in self._book_list:
        if not entry:
            continue
        short_title = f"{entry.get_title()[:20]} ({entry.get_book_id_number()})"
        # Match on the full title, the short title, or accept everything.
        if get == entry.get_title() or get == short_title or get == "all":
            selected[short_title] = entry

    if remove:
        for entry in selected.values():
            self.remove_book(entry)
        return None
    if len(selected) == 1:
        # A single hit is handed back directly rather than wrapped in a dict.
        return next(iter(selected.values()))
    return selected
clear(self)
Delete all files and books in the corpus.
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def clear(self):
    """Delete the corpus directory with the files in it and forget all books."""
    corpus_dir = pathlib.Path(self.path())
    if corpus_dir.is_dir():
        # The directory can only be removed once it is empty.
        for entry in corpus_dir.iterdir():
            entry.unlink()
        corpus_dir.rmdir()
    self._book_list.clear()
description(self, description=None)
Sets and returns the description of the corpus.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
description(str, |
optional |
The description of the corpus (Default value = None) |
required |
Returns:
Type | Description |
---|---|
str |
The description of the corpus |
Examples:
>>> corpus.description()
# 'Books by Jane Austen'
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def description(self, description=None):
    """Combined getter/setter for the corpus description.

    Args:
        description (str, optional): New description of the corpus; the
            current one is kept when omitted. (Default value = None)

    Returns:
        str: The description of the corpus.

    Examples:
        >>> corpus.description()
        # 'Books by Jane Austen'
    """
    if description is None:
        return self._description
    self._description = description
    return self._description
download_book(self, book)
Download the text file for a single book and save it into the corpus repository
Parameters:
Name | Type | Description | Default |
---|---|---|---|
book |
dhtk.common.book.Book |
The book whose text you need to download |
required |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def download_book(self, book):
    """Download the text file of a single book into the corpus directory.

    The corpus directory is created when missing. A book whose text file is
    already present is skipped.

    Args:
        book (dhtk.common.book.Book): The book whose text you need to download.
    """
    target_dir = self.path()
    if not target_dir.is_dir():
        make_dirs(target_dir)
    file_name = book.get_text_file_name()
    text_file = target_dir / file_name
    if text_file.is_file():
        # Already downloaded: nothing to fetch.
        return
    book.repository().save_clean_text_file_to(target_dir)
    # Pause after each download (presumably to throttle requests - TODO confirm).
    sleep(5)
download_corpus(self)
Download the text files for the full corpus to the corpus path directory.
Examples:
>>> corpus.download_corpus()
>>> pprint(os.listdir(corpus.get_corpus_path()))
# ['158-gutenberg.txt',
# '19839-gutenberg.txt',
# '121-gutenberg.txt',
# '22954-gutenberg.txt',
# '1212-gutenberg.txt',
# '25946-gutenberg.txt',
# '22962-gutenberg.txt',
# '22953-gutenberg.txt']
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def download_corpus(self):
    """Download the text file of every book in the corpus, one ``download_book`` call each.

    Examples:
        >>> corpus.download_corpus()
        >>> pprint(os.listdir(corpus.get_corpus_path()))
        # ['158-gutenberg.txt',
        #  '19839-gutenberg.txt',
        #  '121-gutenberg.txt',
        #  '22954-gutenberg.txt',
        #  '1212-gutenberg.txt',
        #  '25946-gutenberg.txt',
        #  '22962-gutenberg.txt',
        #  '22953-gutenberg.txt']
    """
    for entry in self._book_list:
        self.download_book(entry)
download_metadata(self, filename='books_metadata.csv')
Download the metadata for the full corpus into a csv file
Parameters:
Name | Type | Description | Default |
---|---|---|---|
filename |
str |
the filename for the CSV file. Default to "books_metadata.csv" |
'books_metadata.csv' |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def download_metadata(self, filename="books_metadata.csv"):
    """Write the metadata of the full corpus to CSV files in the corpus directory.

    Two files are written: the book metadata (``filename``) and the author
    metadata (always "authors_metadata.csv"). The corpus directory is
    created when missing.

    Args:
        filename (str): The filename for the books CSV file. Defaults to
            "books_metadata.csv".
    """
    # Imported locally, as in ``to_pandas_dataframe``, so that this method
    # does not depend on a module-level ``pd`` name.
    import pandas as pd

    corpus_path = self.path()
    if not corpus_path.is_dir():
        make_dirs(corpus_path)

    # Save book metadata
    pandas_table = self.to_pandas_dataframe()
    pandas_table.to_csv(path_or_buf=corpus_path / filename, index=False)

    # Save author metadata: one row per distinct full name, in the order in
    # which the authors first appear in the book list (deterministic output,
    # and authors do not need to be hashable).
    authors_meta = {}
    for book in self._book_list:
        author = book.get_author()
        full_name = author.get_full_name()
        if full_name not in authors_meta:
            authors_meta[full_name] = author.to_dict()
    panda_author = pd.DataFrame.from_dict(authors_meta, orient="index")
    # Move the "name" column to the front. An empty corpus produces a frame
    # without columns, so guard the lookup instead of raising KeyError.
    if "name" in panda_author.columns:
        names = panda_author["name"]
        panda_author.drop(columns="name", inplace=True)
        panda_author.insert(0, 'name', names)
    panda_author.to_csv(path_or_buf=corpus_path / "authors_metadata.csv", index=False)
get_book_file_name(book)
staticmethod
Return a good filename for a book.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
book |
dhtk.common.book.Book |
A book found by searching Gutenberg, e.g. obtained from its book id with the method `book_from_book_id()`. |
required |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
@staticmethod
def get_book_file_name(book):
"""Return a good filename for a book.
Args:
book (dhtk.common.book.Book): It is the book from searching in gutenberg and get book id using the method `book_from_book_id()`.
Returns:
"""
return book.get_text_file_name()
name(self, name=None)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name(str, |
optional |
The corpus file will be saved in the local machine with this name. (Default value = None) |
required |
Returns:
Type | Description |
---|---|
str |
the name of corpus. |
Examples:
>>> corpus.get_name()
# 'jane_austen'
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def name(self, name=None):
    """Combined getter/setter for the corpus name.

    Args:
        name (str, optional): New name of the corpus; the current one is
            kept when omitted. The corpus directory is derived from this
            name. (Default value = None)

    Returns:
        str: the name of corpus.

    Examples:
        >>> corpus.name()
        # 'jane_austen'
    """
    if name is None:
        return self._name
    self._name = name
    return self._name
path(self, path=None)
Sets and returns the path containing the text files of the books in the corpus.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path(str, |
optional |
the path dir. Default to None |
required |
Returns:
Type | Description |
---|---|
str |
Path of corpus with the name of directory and the name of corpus. Examples: |
corpus.path() # '~Desktop/jane_austen'
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def path(self, path=None):
    """Sets and returns the path containing the text files of the books in the corpus.

    Args:
        path (str or pathlib.Path, optional): New base directory for the
            corpus; the current one is kept when omitted. Defaults to None.

    Returns:
        pathlib.Path: The base directory joined with the corpus name, in
            which every run of whitespace is replaced by an underscore.

    Examples:
        >>> corpus.path()
        # '~Desktop/jane_austen'
    """
    if path is not None:
        # Coerce to Path: the "/" join below is not defined for plain
        # strings, which is what the documented ``str`` argument would be.
        self._corpora_path = pathlib.Path(path)
    directory_name = "_".join(self._name.split())
    return self._corpora_path / directory_name
remove_book(self, book)
Delete a book from the corpus by removing it from the list of books and also deleting the book's file on the local machine.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
book |
dhtk.common.book.Book |
The book to be removed |
required |
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def remove_book(self, book):
    """Remove a book from the corpus: from the list of books and from disk.

    Args:
        book (dhtk.common.book.Book): The book to be removed.

    Raises:
        ValueError: If the book is not in the corpus (from ``list.remove``).
    """
    self._book_list.remove(book)
    stored_text = self.path() / self.get_book_file_name(book)
    # The text may never have been downloaded; only delete what exists.
    if stored_text.is_file():
        stored_text.unlink()
to_dict(self)
Convert to python dict for general purpose.
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def to_dict(self):
    """Convert the corpus to a plain dict.

    Returns:
        dict: Maps each book's index in the list to the book's own dict,
            extended with "text_file_path" when the text file of the book
            exists in the corpus directory.
    """
    as_dict = {}
    for position, entry in enumerate(self._book_list):
        record = entry.to_dict()
        file_name = entry.get_text_file_name()
        text_file = self.path() / file_name
        if text_file.is_file():
            record["text_file_path"] = text_file
        as_dict[position] = record
    return as_dict
to_pandas_dataframe(self)
Convert the list of books into a pandas.DataFrame.
Source code in dhtk/data_sources/gutenberg/api/corpus.py
def to_pandas_dataframe(self):
    """Convert the list of books into a pandas.DataFrame.

    Returns:
        pandas.DataFrame: One row per book, built from the book's own dict
            and extended with "text_file_path" when the text file of the
            book exists in the corpus directory.
    """
    import pandas as pd

    rows = []
    for entry in self._book_list:
        record = entry.to_dict()
        file_name = entry.get_text_file_name()
        text_file = self.path() / file_name
        if text_file.is_file():
            record["text_file_path"] = text_file
        rows.append(record)
    return pd.DataFrame.from_dict(rows)
data
Contains the GutenbergData implementation of the abstract LiteraryData class
GutenbergData (Data)
Class for searching the Gutenberg catalog using SPARQL queries, inheriting from the abstract class LiteraryData (dhtk.data_sources.abstract_gutenberg).
The "query" attributes together form the skeleton of a standard query: query_header + query_select + query_head.
Source code in dhtk/data_sources/gutenberg/api/data.py
class GutenbergData(Data):
    """Search the Gutenberg catalog using SPARQL queries.

    Implementation of the ``Data`` base class for the Gutenberg catalog.
    Every query is assembled from three parts: the shared prefix header
    (``_namespace``), a ``SELECT`` clause and a ``WHERE`` pattern built from
    the ``_shelves``, ``_subjects``, ``_authors`` and ``_books`` fragments.
    """

    # TODO: implement different types than text! '?book_id dcterms:type dcmitype:Text.'
    # TODO: add method to search book when author is known.

    # Prefix declarations prepended to every query.
    _namespace = "\n".join([
        "PREFIX dcterms: <http://purl.org/dc/terms/>",
        "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>",
        "PREFIX purl: <http://purl.org/dc/terms/>",
        "PREFIX owl: <http://www.w3.org/2002/07/owl#>",
        "PREFIX pgterms: <http://www.gutenberg.org/2009/pgterms/>",
        "PREFIX foaf: <http://xmlns.com/foaf/0.1/>",
        "PREFIX marcrel: <http://id.loc.gov/vocabulary/relators/>",
        "PREFIX dcmitype: <http://purl.org/dc/dcmitype/>\n",
    ])

    _work_types = [
        "Text",
        "Image",
        "Sound",
        "Dataset",
        "StillImage",
        "Collection",
        "MovingImage",
    ]

    _search_cache = dict()

    # One triple pattern per book metadata field; "%s" receives the book URI.
    # Each pattern is sent as its own query by ``_metadata_query``.
    _book_metadata = (
        "<%s> rdf:type ?gutenberg_type . ",
        "<%s> pgterms:downloads ?gutenberg_downloads .",
        "<%s> dcterms:publisher ?gutenberg_publisher .",
        "<%s> dcterms:hasFormat ?gutenberg_hasFormat .",
        "<%s> dcterms:language [rdf:value ?gutenberg_language] .",
        # The predicate used to read "rdf:valuowl:sameAse", which is not a
        # valid prefixed name; restored to rdf:value like the sibling
        # blank-node patterns (language, type, bookshelf).
        "<%s> dcterms:subject [rdf:value ?gutenberg_subject] .",
        "<%s> dcterms:type [rdf:value ?gutenberg_media_type] .",
        "<%s> dcterms:rights ?gutenberg_rights .",
        "<%s> dcterms:title ?gutenberg_title .",
        "<%s> dcterms:issued ?gutenberg_issued .",
        "<%s> dcterms:creator ?gutenberg_creator .",
        "<%s> dcterms:license ?gutenberg_license .",
        "<%s> dcterms:tableOfContents ?gutenberg_tableOfContents .",
        "<%s> pgterms:marc010 ?gutenberg_marc010 .",
        "<%s> pgterms:marc901 ?gutenberg_marc901 .",
        "<%s> pgterms:bookshelf [rdf:value ?gutenberg_bookshelf] .",
        "<%s> pgterms:marc440 ?gutenberg_marc440 .",
        "<%s> dcterms:description ?gutenberg_description .",
        "<%s> marcrel:trl ?gutenberg_trl .",
        "<%s> dcterms:alternative ?gutenberg_alternative .",
        "<%s> marcrel:edt ?gutenberg_edt .",
        "<%s> marcrel:aui ?gutenberg_aui .",
        "<%s> marcrel:pbl ?gutenberg_pbl .",
        "<%s> marcrel:ill ?gutenberg_ill .",
        "<%s> marcrel:cmm ?gutenberg_cmm .",
        "<%s> marcrel:com ?gutenberg_com .",
        "<%s> marcrel:oth ?gutenberg_oth .",
        "<%s> pgterms:marc260 ?gutenberg_marc260 .",
        "<%s> marcrel:ctb ?gutenberg_ctb .",
        "<%s> marcrel:ann ?gutenberg_ann .",
        "<%s> marcrel:egr ?gutenberg_egr .",
        "<%s> pgterms:marc508 ?gutenberg_marc508 .",
        "<%s> pgterms:marc546 ?gutenberg_marc546 .",
        "<%s> pgterms:marc902 ?gutenberg_marc902 .",
        "<%s> pgterms:marc520 ?gutenberg_marc520 .",
        "<%s> pgterms:marc903 ?gutenberg_marc903 .",
        "<%s> pgterms:marc300 ?gutenberg_marc300 .",
        "<%s> marcrel:adp ?gutenberg_adp .",
        "<%s> marcrel:pht ?gutenberg_pht .",
        "<%s> marcrel:unk ?gutenberg_unk .",
        "<%s> marcrel:prt ?gutenberg_prt .",
        "<%s> marcrel:prf ?gutenberg_prf .",
        "<%s> pgterms:marc250 ?gutenberg_marc250 .",
        "<%s> pgterms:marc020 ?gutenberg_marc020 .",
        "<%s> marcrel:cmp ?gutenberg_cmp .",
        "<%s> marcrel:dub ?gutenberg_dub .",
        "<%s> marcrel:arr ?gutenberg_arr .",
        "<%s> marcrel:trc ?gutenberg_trc .",
        "<%s> marcrel:clb ?gutenberg_clb .",
        "<%s> marcrel:aft ?gutenberg_aft .",
        "<%s> marcrel:res ?gutenberg_res .",
        "<%s> marcrel:art ?gutenberg_art .",
        "<%s> owl:sameAs|foaf:isPrimaryTopicOf ?same_as .",
    )

    # One triple pattern per author metadata field; "%s" receives the author URI.
    _author_metadata = (
        "<%s> pgterms:alias ?aliases .",
        "<%s> pgterms:birthdate ?birth_date .",
        "<%s> pgterms:deathdate ?death_date .",
        "<%s> owl:sameAs|foaf:isPrimaryTopicOf ?same_as .",
        "<%s> pgterms:webpage ?web_pages .",
        "<%s> rdf:type ?gutenberg_type .",
    )

    def __init__(self, sparql_endpoint):
        """Initialize tools with the SPARQL endpoint,
        such as a local instance of the Apache Jena Fuseki server.

        Args:
            sparql_endpoint (str): URL of the triplet store containing Gutenberg Catalog triplets.

        Raises:
            EnvironmentError: If the SPARQL wrapper cannot be created for the endpoint.
        """
        try:
            self._sparql_endpoint = SPARQLWrapper(sparql_endpoint)
        except Exception as error:
            raise EnvironmentError(
                f"Check the sparql_endpoint you provided!: {sparql_endpoint}"
            ) from error
        logger.info(f"GUTENBERG: GutenbergData instantiated using SPARQL endpoint: {sparql_endpoint}")

    @staticmethod
    def _escape_literal(text):
        """Escape *text* for use inside a double-quoted SPARQL string literal.

        Backslashes, double quotes and line breaks would otherwise end the
        literal early or make the query unparsable.
        """
        return (
            text.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

    # Bookshelves
    _shelves = """?book_id pgterms:bookshelf [dcterms:title ?bookshelf] ."""

    def all_bookshelves(self, select="SELECT DISTINCT ?bookshelf"):
        """Return all bookshelves in the store.

        This method doesn't use the standard SPARQL query, but a specific one to get only bookshelves.

        Args:
            select (str): SELECT clause of the query (Default value = "SELECT DISTINCT ?bookshelf")

        Returns:
            list: the bookshelf names; when ``select`` contains "COUNT", the
                "total" value of the first result row instead.
        """
        pattern = f"""
WHERE {{
{self._shelves}
}}
ORDER BY ?bookshelf
"""
        results = self._get_query_results(self._namespace + select + pattern)
        if "COUNT" in select:
            return results[0]["total"]
        return [result["bookshelf"] for result in results]

    def search_by_bookshelf(self, bookshelf):
        """Search the Gutenberg catalog for all books on bookshelves matching the given string.

        Args:
            bookshelf (str): bookshelf in plain text, case-insensitive. It can be a part of the bookshelf.

        Returns:
            list: one dict per result row of the query
        """
        logger.info(f"GUTENBERG: Searching bookshelf: {bookshelf}")
        # Escaped so that quotes in the search text cannot break the query.
        needle = self._escape_literal(bookshelf.lower())
        select = "SELECT DISTINCT *"
        pattern = f"""
WHERE {{
{self._shelves}
FILTER CONTAINS(lcase(str(?bookshelf)), "{needle}")
{self._books}
OPTIONAL {{ ?book_id dcterms:language [rdf:value ?language].}}
}}
ORDER BY ?author ?title
"""
        query = self._namespace + select + pattern
        return self._get_query_results(query)

    # Subjects
    _subjects = """?book_id dcterms:subject [dcterms:title ?subject]."""

    def all_subjects(self, select="SELECT DISTINCT ?subject"):
        """Return all subjects in the store.

        This method doesn't use the standard SPARQL query, but a specific one to get only subjects.

        Args:
            select (str): SELECT clause of the query (Default value = "SELECT DISTINCT ?subject")

        Returns:
            list: the subjects; when ``select`` contains "COUNT", the
                "total" value of the first result row instead.
        """
        pattern = f"""
WHERE {{
{self._subjects}
}}
ORDER BY ?subject
"""
        results = self._get_query_results(self._namespace + select + pattern)
        if "COUNT" in select:
            return results[0]["total"]
        return [result["subject"] for result in results]

    def search_by_subject(self, subject, limit=0):
        """Search the Gutenberg catalog for all books with subjects matching the given string.

        If no limit value is specified, the method returns all books with the given subject.

        Args:
            subject (str): Subject in plain text, case-insensitive. It can be a part of the subject.
            limit (int, optional): Use to limit how many books are returned by the SPARQL query. (Default value = 0)

        Returns:
            list: one dict per result row of the query
        """
        # Escaped so that quotes in the search text cannot break the query.
        needle = self._escape_literal(subject.lower())
        select = "SELECT DISTINCT *"
        pattern = f"""
WHERE {{
{self._subjects}
FILTER CONTAINS(lcase(str(?subject)), "{needle}")
{self._books}
OPTIONAL {{ ?book_id dcterms:language [rdf:value ?language].}}
}}
ORDER BY ?author ?title
"""
        query = self._namespace + select + pattern
        if limit > 0:
            query += f"LIMIT {limit}"
        return self._get_query_results(query)

    # Authors
    _authors = """?author_id a pgterms:agent;
pgterms:name|pgterms:alias ?author."""

    def all_authors(self, select="SELECT DISTINCT ?author"):
        """Return all authors in the store.

        This method doesn't use the standard SPARQL query, but a specific one to get only authors.

        Args:
            select (str): SELECT clause of the query (Default value = "SELECT DISTINCT ?author")

        Returns:
            list: the author names; when ``select`` contains "COUNT", the
                "total" value of the first result row instead.
        """
        pattern = f"""
WHERE {{
{self._authors}
}}
ORDER BY ?author
"""
        results = self._get_query_results(self._namespace + select + pattern)
        if "COUNT" in select:
            return results[0]["total"]
        return [result["author"] for result in results]

    def search_by_author(self, name, alias=None):
        """Search authors in the Gutenberg catalog by name and, optionally, alias.

        The name (and the alias) is split on non-word characters and every
        resulting part must be contained in the author's name. When no alias
        is given, a part may match one of the author's aliases instead.

        Args:
            name (str): Author's name, first name, last name or alias in plain text, case-insensitive.
                It can be a part of the author's name.
            alias (str, optional): Alias whose parts must all be contained in
                the author's aliases. (Default value = None)

        Returns:
            list: one dict per result row of the query
        """
        # If alias is provided, looks for name AND alias
        # Otherwise assume the name might be an alias
        # The parts contain word characters only, so they need no escaping.
        clauses = []
        for name_part in re.split(r'\W+', name):
            clause = f'FILTER (CONTAINS(lcase(str(?author)), "{name_part.lower()}")'
            if alias is None:
                clause += f' || CONTAINS(lcase(str(?aliases)), "{name_part.lower()}")'
            clauses.append(clause + ")\n")
        if alias is not None:
            for alias_part in re.split(r'\W+', alias):
                clauses.append(f'FILTER (CONTAINS(lcase(str(?aliases)), "{alias_part.lower()}"))\n')
        author_filter = "".join(clauses)
        select = "SELECT DISTINCT ?author ?aliases ?author_id"
        pattern = f"""
WHERE {{
{self._authors}
{author_filter}
OPTIONAL {{ ?author_id pgterms:alias ?aliases. }}
}}
ORDER BY ?author
"""
        query = self._namespace + select + pattern
        return self._get_query_results(query)

    def get_author(self, id):
        """Create an author object with information collected from the Gutenberg Store.

        Args:
            id (str): The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408'

        Returns:
            GutenbergAuthor: the author, with its metadata and bibliography attached

        Raises:
            IndexError: If the store has no name for the given identifier.
        """
        select = "SELECT DISTINCT *"
        pattern = f"""
WHERE {{
<{id}> pgterms:name ?name .
}}
"""
        query = self._namespace + select + pattern
        first_row = self._get_query_results(query)[0]
        author = GutenbergAuthor(
            gutenberg_id=id,
            name=first_row["name"],
        )
        author.update_metadata(self.get_metadata(author))
        author.update_metadata({"bibliography": self.get_bibliography(id)})
        return author

    def get_bibliography(self, id):
        """Get the titles of all books written by an author.

        Args:
            id (str): The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408'

        Returns:
            list: the book titles, ordered by title
        """
        select = "SELECT DISTINCT ?title"
        pattern = f"""
WHERE {{
?book_id purl:creator <{id}> .
{self._books}
}}
ORDER BY ?title
"""
        query = self._namespace + select + pattern
        return [result["title"] for result in self._get_query_results(query)]

    # Books
    # NOTE(review): this fragment does not relate ?book_id to ?author_id
    # (no creator triple), so books and authors look unjoined - confirm
    # this is intended.
    _books = f"""?book_id purl:title ?title.
{_authors}
"""

    def all_books(self, select="SELECT DISTINCT ?title ?author"):
        """Return the title and author of all books in the store.

        This method doesn't use the standard query, but a specific one.

        Args:
            select (str): SELECT clause of the query (Default value = "SELECT DISTINCT ?title ?author")

        Returns:
            list: one dict with "title" and "author" per result row; when
                ``select`` contains "COUNT", the "total" value of the first
                result row instead.
        """
        pattern = f"""
WHERE {{
{self._books}
}}
ORDER BY ?title
"""
        results = self._get_query_results(self._namespace + select + pattern)
        if "COUNT" in select:
            return results[0]["total"]
        return [{"title": result['title'], "author": result['author']} for result in results]

    def search_by_title(self, title):
        """Search the Gutenberg catalog for all books with titles matching the given string.

        Args:
            title (str): Title in plain text, case-insensitive. It can be a part of the title.

        Returns:
            list: one dict per result row of the query
        """
        # Escaped like the other searches instead of relying on Python's
        # repr() to produce a valid SPARQL literal.
        needle = self._escape_literal(title.lower())
        select = "SELECT DISTINCT ?book_id ?title ?author_id ?author ?language"
        pattern = f"""
WHERE {{
{self._books}
OPTIONAL {{?book_id dcterms:language [rdf:value ?language].}}
FILTER CONTAINS(lcase(str(?title)), "{needle}")
}}
ORDER BY ?author ?title
"""
        query = self._namespace + select + pattern
        return self._get_query_results(query)

    def get_book(self, book_id, author=None):
        """Create a book object with information collected from the Gutenberg Store.

        Args:
            book_id (str): The book identifier is a URI, like 'http://www.gutenberg.org/ebooks/20063'
            author: The author of the book; looked up with ``get_author``
                when omitted. (Default value = None)

        Returns:
            GutenbergBook: A book object

        Raises:
            IndexError: If the store has no matching text for the given identifier.
        """
        select = "SELECT DISTINCT *"
        pattern = f"""
WHERE {{
<{book_id}> purl:title ?title;
purl:creator ?author_id;
dcterms:type dcmitype:Text.
}}"""
        query = self._namespace + select + pattern
        first_row = self._get_query_results(query)[0]
        if not author:
            author = self.get_author(first_row["author_id"])
        metadata = self.bookshelves_subjects(book_id)
        book = GutenbergBook(
            gutenberg_id=book_id,
            title=first_row["title"],
            # Empty sets are passed on as None.
            subject=metadata["subjects"] or None,
            bookshelf=metadata["bookshelves"] or None,
            author=author
        )
        book.update_metadata(self.get_metadata(book))
        return book

    def bookshelves_subjects(self, book_id):
        """Return the bookshelves and the subjects of the given book, designated by its identifier.

        Args:
            book_id (str): A Gutenberg book identifier. Is a URI, like "http://www.gutenberg.org/ebooks/10053"

        Returns:
            dict: two sets, under the keys 'bookshelves' and 'subjects'
        """
        # NOTE(review): subject and bookshelf are both mandatory in this
        # pattern, so a book that lacks either one yields no rows at all -
        # confirm this is intended.
        select = "SELECT DISTINCT ?subject ?bookshelf"
        pattern = f"""
WHERE {{
<{book_id}> dcterms:subject [dcterms:title ?subject];
pgterms:bookshelf [dcterms:title ?bookshelf] ;
dcterms:type dcmitype:Text.
}}
ORDER BY ?subject
"""
        query = self._namespace + select + pattern
        results = self._get_query_results(query)
        subjects = {result["subject"] for result in results}
        bookshelves = {result["bookshelf"] for result in results}
        return {"bookshelves": bookshelves, "subjects": subjects}

    # Queries
    def _get_query_results(self, query):
        """Use a SPARQL query to get results from the triplet store.

        Args:
            query (str): A structured string in the SPARQL language used to ask the triplet store.

        Returns:
            list: one dict per result binding, mapping each variable name to its value
        """
        logger.debug(f"GUTENBERG: Executing query: \n{query}")
        sparql = self._sparql_endpoint
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        # TODO: handle remote server better than this, please:
        query_results = sparql.queryAndConvert()
        # Keep only the plain value of every binding.
        return [
            {key: value["value"] for key, value in entry.items()}
            for entry in query_results["results"]["bindings"]
        ]

    def get_metadata(self, item):
        """Get metadata about the book or author that is present in the catalog.

        Args:
            item (GutenbergBook or GutenbergAuthor): The item whose metadata is requested.

        Returns:
            dict: maps each variable to its value, or to the list of its
                values when the catalog holds more than one; fields without
                results are left out.
        """
        metadata = dict()
        for result in self._metadata_query(item):
            if len(result) == 1:
                metadata.update(result[0])
            elif len(result) > 1:
                for key in result[0].keys():
                    metadata[key] = [entry[key] for entry in result]
        return metadata

    def _metadata_query(self, item):
        """Helper function to get metadata for different item types.

        Args:
            item (GutenbergBook or GutenbergAuthor): The item whose metadata is requested.

        Returns:
            list: one list of result rows per metadata pattern

        Raises:
            TypeError: If the item is neither a book nor an author.

        TODO: fix this! One query should be sufficient.
        """
        query = self._namespace
        query += """ SELECT DISTINCT *
WHERE {
"""
        if isinstance(item, GutenbergBook):
            item_id = item.get_book_id()
            patterns = self._book_metadata
        elif isinstance(item, GutenbergAuthor):
            item_id = item.get_gutenberg_id()
            patterns = self._author_metadata
        else:
            # Fail here with a clear message; returning None made
            # get_metadata() fail with an unrelated "not iterable" TypeError.
            raise TypeError(
                f"Cannot query metadata for an item of type {type(item).__name__}"
            )
        # One query per metadata pattern.
        return [
            self._get_query_results(query + pattern % item_id + "}")
            for pattern in patterns
        ]

    def statistics(self):
        """Return information about the Gutenberg catalog.

        Returns:
            str: Formatted string of different statistics. Subject counts sub-subjects too.

        Notes:
            This method is relatively slow due to the fact that it inspects the whole Gutenberg RDF.

        Examples:
            >>> gutenberg_data.statistics()
            # number of books : 60101
            # number of authors : 20908
            # number of bookshelves : 335
            # number of subjects : 17524
        """
        statistics = {
            "number_of_books": self.all_books(
                select="SELECT (COUNT(DISTINCT ?title) as ?total)"),
            "number_of_authors": self.all_authors(
                select="SELECT (COUNT(DISTINCT ?author) as ?total)"),
            "number_of_bookshelves": self.all_bookshelves(
                select="SELECT (COUNT(DISTINCT ?bookshelf) as ?total)"),
            "number_of_subjects": self.all_subjects(
                select="SELECT (COUNT(DISTINCT ?subject) as ?total)"),
        }
        return "".join(
            f"\n{key.replace('_', ' '):23}:\t {value:>5}"
            for key, value in statistics.items()
        )
__init__(self, sparql_endpoint)
special
Initialize tools with the SPARQL endpoint, such as a local instance of the Apache Jena Fuseki server.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
sparql_endpoint |
str) |
URL of the triplet store containing Gutenberg Catalog triplets. |
required |
Source code in dhtk/data_sources/gutenberg/api/data.py
def __init__(self, sparql_endpoint):
    """Set up the connection to the SPARQL endpoint,
    such as a local instance of the Apache Jena Fuseki server.

    Args:
        sparql_endpoint (str): URL of the triplet store containing Gutenberg Catalog triplets.

    Raises:
        EnvironmentError: If the SPARQL wrapper cannot be created for the endpoint.
    """
    try:
        endpoint_client = SPARQLWrapper(sparql_endpoint)
    except Exception as error:
        message = f"Check the sparql_endpoint you provided!: {sparql_endpoint}"
        raise EnvironmentError(message) from error
    self._sparql_endpoint = endpoint_client
    logger.info(f"GUTENBERG: GutenbergData instantiated using SPARQL endpoint: {sparql_endpoint}")
all_authors(self, select='SELECT DISTINCT ?author')
Return all authors in the store.
This method doesn't use the standard SPARQL query, but a specific one that retrieves only authors.
Args: select (str): The SPARQL query (Default value = "SELECT DISTINCT ?author")
Returns: list : the query results
Source code in dhtk/data_sources/gutenberg/api/data.py
def all_authors(self, select="SELECT DISTINCT ?author"):
    """List every author known to the store.

    This method doesn't use the standard SPARQL query, but a specific one to get only authors.

    Args:
        select (str): SELECT clause of the query (Default value = "SELECT DISTINCT ?author")

    Returns:
        list: the author names; when ``select`` contains "COUNT", the
            "total" value of the first result row instead.
    """
    where_clause = f"""
WHERE {{
{self._authors}
}}
ORDER BY ?author
"""
    rows = self._get_query_results(self._namespace + select + where_clause)
    if "COUNT" not in select:
        return [row["author"] for row in rows]
    return rows[0]["total"]
all_books(self, select='SELECT DISTINCT ?title ?author')
Return the title of all books in the store.
This method doesn't use the standard query, but a specific one that retrieves only titles and book identifiers.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
select |
(Default value = "SELECT DISTINCT ?title ?author") |
'SELECT DISTINCT ?title ?author' |
Returns:
Type | Description |
---|---|
list |
the query results |
Source code in dhtk/data_sources/gutenberg/api/data.py
def all_books(self, select="SELECT DISTINCT ?title ?author"):
    """List the title and author of every book known to the store.

    This method doesn't use the standard query, but a specific one.

    Args:
        select (str): SELECT clause of the query (Default value = "SELECT DISTINCT ?title ?author")

    Returns:
        list: one dict with "title" and "author" per result row; when
            ``select`` contains "COUNT", the "total" value of the first
            result row instead.
    """
    where_clause = f"""
WHERE {{
{self._books}
}}
ORDER BY ?title
"""
    rows = self._get_query_results(self._namespace + select + where_clause)
    if "COUNT" not in select:
        return [{"title": row['title'], "author": row['author']} for row in rows]
    return rows[0]["total"]
all_bookshelves(self, select='SELECT DISTINCT ?bookshelf')
Return all bookshelves in the store.
This method doesn't use the standard SPARQL query, but a specific one that retrieves only bookshelves.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
select |
str) |
SPARQL query (Default value = "SELECT DISTINCT ?bookshelf") |
'SELECT DISTINCT ?bookshelf' |
Returns:
Type | Description |
---|---|
list |
the results for the query |
Source code in dhtk/data_sources/gutenberg/api/data.py
def all_bookshelves(self, select="SELECT DISTINCT ?bookshelf"):
    """Return every bookshelf in the store.

    A dedicated query is built from ``self._shelves`` instead of the standard one.

    Args:
        select (str): SELECT clause of the SPARQL query. When it contains
            "COUNT", the ``total`` value of the first result row is returned.
            (Default value = "SELECT DISTINCT ?bookshelf")

    Returns:
        list: The ``bookshelf`` value of each result row, or the ``total``
        value of the first row for a COUNT query.
    """
    where_clause = f"\nWHERE {{\n{self._shelves}\n}}\nORDER BY ?bookshelf\n"
    rows = self._get_query_results(self._namespace + select + where_clause)
    if "COUNT" not in select:
        return [row["bookshelf"] for row in rows]
    return rows[0]["total"]
all_subjects(self, select='SELECT DISTINCT ?subject')
Return all subjects in the store.
This method doesn't use the standard SPARQL query, but a specific one to get only subjects.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
select |
str |
The SPARQL query (Default value = "SELECT DISTINCT ?subject") |
'SELECT DISTINCT ?subject' |
Returns:
Type | Description |
---|---|
list |
a list with the query results |
Source code in dhtk/data_sources/gutenberg/api/data.py
def all_subjects(self, select="SELECT DISTINCT ?subject"):
    """Return every subject in the store.

    A dedicated query is built from ``self._subjects`` instead of the standard one.

    Args:
        select (str): SELECT clause of the SPARQL query. When it contains
            "COUNT", the ``total`` value of the first result row is returned.
            (Default value = "SELECT DISTINCT ?subject")

    Returns:
        list: The ``subject`` value of each result row, or the ``total``
        value of the first row for a COUNT query.
    """
    where_clause = f"\nWHERE {{\n{self._subjects}\n}}\nORDER BY ?subject\n"
    rows = self._get_query_results(self._namespace + select + where_clause)
    if "COUNT" not in select:
        return [row["subject"] for row in rows]
    return rows[0]["total"]
bookshelves_subjects(self, book_id)
Return the bookshelves and the subjects of the given book, designated by its identifier.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
book_id(str) |
A Gutenberg book identifier. Is a URI, like "http://www.gutenberg.org/ebooks/10053" |
required |
Returns:
Type | Description |
---|---|
dict |
the query results with 'bookshelves' and 'subjects' |
Source code in dhtk/data_sources/gutenberg/api/data.py
def bookshelves_subjects(self, book_id):
    """Return the bookshelves and the subjects of the book with the given identifier.

    Args:
        book_id (str): A Gutenberg book identifier. It is a URI, like
            "http://www.gutenberg.org/ebooks/10053"

    Returns:
        dict: ``{"bookshelves": set, "subjects": set}`` collected from the
        ``bookshelf`` and ``subject`` values of the result rows.
    """
    where_clause = "\n".join([
        "",
        "WHERE {",
        f"<{book_id}> dcterms:subject [dcterms:title ?subject];",
        "pgterms:bookshelf [dcterms:title ?bookshelf] ;",
        "dcterms:type dcmitype:Text.",
        "}",
        "ORDER BY ?subject",
        "",
    ])
    rows = self._get_query_results(
        self._namespace + "SELECT DISTINCT ?subject ?bookshelf" + where_clause
    )
    subject_names = {row["subject"] for row in rows}
    shelf_names = {row["bookshelf"] for row in rows}
    return {"bookshelves": shelf_names, "subjects": subject_names}
get_author(self, id)
Create an author object with information collected from the Gutenberg Store.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
id(str) |
The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408' |
required |
Returns:
Type | Description |
---|---|
str |
the query results |
Source code in dhtk/data_sources/gutenberg/api/data.py
def get_author(self, id):
    """Build an author object from the data held in the Gutenberg store.

    Args:
        id (str): The author identifier, a URI like
            'http://www.gutenberg.org/2009/agents/408'

    Returns:
        GutenbergAuthor: The author, with the result of ``self.get_metadata``
        and a ``bibliography`` entry merged into its metadata.
    """
    where_clause = f"\nWHERE {{\n<{id}> pgterms:name ?name .\n}}\n"
    # Only the first result row is used; an empty result raises IndexError.
    first_row = self._get_query_results(
        self._namespace + "SELECT DISTINCT *" + where_clause
    )[0]
    author = GutenbergAuthor(gutenberg_id=id, name=first_row["name"])
    author.update_metadata(self.get_metadata(author))
    titles = self.get_bibliography(id)
    author.update_metadata({"bibliography": titles})
    return author
get_bibliography(self, id)
To get all books written by an author.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
id(str) |
The author identifier is an URI, like 'http://www.gutenberg.org/2009/agents/408' |
required |
Returns:
Type | Description |
---|---|
list |
the query results |
Source code in dhtk/data_sources/gutenberg/api/data.py
def get_bibliography(self, id):
    """Return the titles of all books written by an author.

    Args:
        id (str): The author identifier, a URI like
            'http://www.gutenberg.org/2009/agents/408'

    Returns:
        list: The ``title`` value of each result row.
    """
    where_clause = (
        "\nWHERE {\n"
        f"?book_id purl:creator <{id}> .\n"
        f"{self._books}\n"
        "}\nORDER BY ?title\n"
    )
    rows = self._get_query_results(self._namespace + "SELECT DISTINCT ?title" + where_clause)
    return [row["title"] for row in rows]
get_book(self, book_id, author=None)
Create a book object with information collected from the Gutenberg Store.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
book_id(str) |
The book identifier is a URI, like 'http://www.gutenberg.org/ebooks/20063' |
required | |
author |
(Default value = None) |
None |
Returns:
Type | Description |
---|---|
gutenberg.tools.book |
A book object |
Source code in dhtk/data_sources/gutenberg/api/data.py
def get_book(self, book_id, author=None):
    """Build a book object from the data held in the Gutenberg store.

    Args:
        book_id (str): The book identifier, a URI like
            'http://www.gutenberg.org/ebooks/20063'
        author: Author object to attach to the book. When falsy, the author
            is fetched with ``self.get_author`` from the ``author_id`` of the
            query result. (Default value = None)

    Returns:
        GutenbergBook: The book, with the result of ``self.get_metadata``
        merged into its metadata.
    """
    where_clause = (
        "\nWHERE {\n"
        f"<{book_id}> purl:title ?title;\n"
        "purl:creator ?author_id;\n"
        "dcterms:type dcmitype:Text.\n"
        "}"
    )
    # Only the first result row is used; an empty result raises IndexError.
    record = self._get_query_results(
        self._namespace + "SELECT DISTINCT *" + where_clause
    )[0]
    if not author:
        author = self.get_author(record["author_id"])
    classification = self.bookshelves_subjects(book_id)
    book = GutenbergBook(
        gutenberg_id=book_id,
        title=record["title"],
        # Empty sets are passed on as None.
        subject=classification["subjects"] or None,
        bookshelf=classification["bookshelves"] or None,
        author=author,
    )
    book.update_metadata(self.get_metadata(book))
    return book
get_metadata(self, item)
Get metadata about the book that is present in the catalog.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
item(An |
object having an entry "gutenberg_id" in the results of the method get_metadata(). |
The metadata of these tools must contain an entry called: "gutenberg_id" |
required |
Returns:
Type | Description |
---|---|
dict |
the query results |
Source code in dhtk/data_sources/gutenberg/api/data.py
def get_metadata(self, item):
    """Collect the catalog metadata available for ``item``.

    Args:
        item: Object handed to ``self._metadata_query``. The original
            documentation states it must expose a "gutenberg_id" metadata entry.

    Returns:
        dict: For every result group returned by ``self._metadata_query``:
        a single-row group contributes its values directly, a multi-row
        group contributes, per key of its first row, the list of that key's
        values across all rows. Empty groups are ignored.
    """
    metadata = {}
    for rows in self._metadata_query(item):
        row_count = len(rows)
        if row_count == 1:
            for field in rows[0].keys():
                metadata[field] = rows[0][field]
        elif row_count > 1:
            metadata.update(
                (field, [row[field] for row in rows]) for field in rows[0].keys()
            )
    return metadata
search_by_author(self, name, alias=None)
Search books in the Gutenberg catalog by author's name and last name. The standard SPARQL query is overridden by a filter and by a sort instruction.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name(str) |
Author's name, first name, last name or alias in plain text, case-insensitive. It can be a part of the author's name. (Default value = None) |
required |
Returns:
Type | Description |
---|---|
str |
the query results |
Source code in dhtk/data_sources/gutenberg/api/data.py
def search_by_author(self, name, alias=None):
    """Search authors in the Gutenberg catalog by name and, optionally, alias.

    ``name`` and ``alias`` are split on non-word characters and every token
    becomes one case-insensitive FILTER of the query.

    Args:
        name (str): Author's name, first name, last name or alias in plain
            text. It can be a part of the author's name.
        alias (str, optional): When given, name tokens must match the author
            name and alias tokens must match the aliases. When None, each
            name token may match either the name or an alias.
            (Default value = None)

    Returns:
        The value returned by ``self._get_query_results`` for the query.
    """
    name_may_be_alias = alias is None
    clauses = []
    for token in re.split(r'\W+', name):
        needle = token.lower()
        clause = f'FILTER (CONTAINS(lcase(str(?author)), "{needle}")'
        if name_may_be_alias:
            clause += f' || CONTAINS(lcase(str(?aliases)), "{needle}")'
        clauses.append(clause + ")\n")
    if not name_may_be_alias:
        clauses.extend(
            f'FILTER (CONTAINS(lcase(str(?aliases)), "{token.lower()}"))\n'
            for token in re.split(r'\W+', alias)
        )
    filters = "".join(clauses)
    where_clause = (
        "\nWHERE {\n"
        f"{self._authors}\n"
        f"{filters}\n"
        "OPTIONAL { ?author_id pgterms:alias ?aliases. }\n"
        "}\nORDER BY ?author\n"
    )
    return self._get_query_results(
        self._namespace + "SELECT DISTINCT ?author ?aliases ?author_id" + where_clause
    )
search_by_bookshelf(self, bookshelf)
Search the Gutenberg catalog for all books corresponding to the given bookshelf string.
The bookshelf is used as parameter in a SPARQL query.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
bookshelf(str) |
bookshelf in plain text, case-insensitive. It can be a part of the bookshelf. |
required |
Returns:
Type | Description |
---|---|
str |
result of the query |
Source code in dhtk/data_sources/gutenberg/api/data.py
def search_by_bookshelf(self, bookshelf):
    """Search the Gutenberg catalog for all books on a matching bookshelf.

    The bookshelf text is lower-cased and embedded as a string literal in a
    SPARQL ``CONTAINS`` filter.

    Args:
        bookshelf (str): Bookshelf in plain text, case-insensitive. It can be
            a part of the bookshelf name.

    Returns:
        The value returned by ``self._get_query_results`` for the query.
    """
    logger.info("GUTENBERG: Searching bookshelf: %s", bookshelf)
    # Escape the characters that would otherwise terminate or corrupt the
    # double-quoted SPARQL literal (backslash first, so the escapes added
    # afterwards are not escaped again).
    needle = bookshelf.lower()
    for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
        needle = needle.replace(raw, escaped)
    select = "SELECT DISTINCT *"
    pattern = (
        "\nWHERE {\n"
        f"{self._shelves}\n"
        f'FILTER CONTAINS(lcase(str(?bookshelf)), "{needle}")\n'
        f"{self._books}\n"
        "OPTIONAL { ?book_id dcterms:language [rdf:value ?language].}\n"
        "}\nORDER BY ?author ?title\n"
    )
    query = self._namespace + select + pattern
    return self._get_query_results(query)
search_by_subject(self, subject, limit=0)
Search the Gutenberg catalog for all books with the given subject string.
The subject is used as parameter in a SPARQL query. If no limit value is specified, the method returns all books with the given subject.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
subject(str) |
Subject in plain text, case-insensitive. It can be a part of the subject. |
required | |
limit(int, |
optional |
Use to limit how many books are returned by the SPARQL query. (Default value = 0) |
required |
Returns:
Type | Description |
---|---|
str |
the query results |
Source code in dhtk/data_sources/gutenberg/api/data.py
def search_by_subject(self, subject, limit=0):
    """Search the Gutenberg catalog for all books with a matching subject.

    The subject text is lower-cased and embedded as a string literal in a
    SPARQL ``CONTAINS`` filter. If no positive limit is given, all matching
    books are returned.

    Args:
        subject (str): Subject in plain text, case-insensitive. It can be a
            part of the subject.
        limit (int, optional): Maximum number of rows requested from the
            SPARQL query; ``0``, negative values and ``None`` mean no limit.
            (Default value = 0)

    Returns:
        The value returned by ``self._get_query_results`` for the query.
    """
    # Escape the characters that would otherwise terminate or corrupt the
    # double-quoted SPARQL literal (backslash first, so the escapes added
    # afterwards are not escaped again).
    needle = subject.lower()
    for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
        needle = needle.replace(raw, escaped)
    select = "SELECT DISTINCT *"
    pattern = (
        "\nWHERE {\n"
        f"{self._subjects}\n"
        f'FILTER CONTAINS(lcase(str(?subject)), "{needle}")\n'
        f"{self._books}\n"
        "OPTIONAL { ?book_id dcterms:language [rdf:value ?language].}\n"
        "}\nORDER BY ?author ?title\n"
    )
    query = self._namespace + select + pattern
    if limit is not None and limit > 0:
        query += f"LIMIT {limit}"
    return self._get_query_results(query)
search_by_title(self, title)
Search the Gutenberg catalog for all books with the given title string.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
title(str) |
Title in plain text, case-insensitive. It can be a part of the title. |
required |
Returns:
Type | Description |
---|---|
str |
the query results |
Source code in dhtk/data_sources/gutenberg/api/data.py
def search_by_title(self, title):
    """Search the Gutenberg catalog for all books with a matching title.

    Args:
        title (str): Title in plain text, case-insensitive. It can be a part
            of the title.

    Returns:
        The value returned by ``self._get_query_results`` for the query.
    """
    # The lower-cased Python repr of the title is used as the quoted literal.
    literal = repr(title).lower()
    where_clause = (
        "\nWHERE {\n"
        f"{self._books}\n"
        "OPTIONAL {?book_id dcterms:language [rdf:value ?language].}\n"
        f"FILTER CONTAINS(lcase(str(?title)), {literal})\n"
        "}\nORDER BY ?author ?title\n"
    )
    return self._get_query_results(
        self._namespace
        + "SELECT DISTINCT ?book_id ?title ?author_id ?author ?language"
        + where_clause
    )
statistics(self)
Print information about the Gutenberg catalog.
Returns:
Type | Description |
---|---|
str |
Formatted string of different statistics. Subject counts sub-subjects too. |
!!! notes This method is relatively slow due to the fact that it inspects the whole Gutenberg RDF.
Examples:
>>> gutenberg_data.statistics()
# number of books : 60101
# number of authors : 20908
# number of bookshelves : 335
# number of subjects : 17524
Source code in dhtk/data_sources/gutenberg/api/data.py
def statistics(self):
    """Return a formatted summary of the Gutenberg catalog.

    Runs one COUNT query each through ``all_books``, ``all_authors``,
    ``all_bookshelves`` and ``all_subjects``.

    Returns:
        str: One line per statistic, each prefixed by a newline.

    Notes:
        The original documentation warns that this method is relatively slow
        because it inspects the whole Gutenberg RDF.

    Examples:
        >>> gutenberg_data.statistics()
        # number of books        :   60101
        # number of authors      :   20908
        # number of bookshelves  :     335
        # number of subjects     :   17524
    """
    counts = {
        "number_of_books": self.all_books(select="SELECT (COUNT(DISTINCT ?title) as ?total)"),
        "number_of_authors": self.all_authors(select="SELECT (COUNT(DISTINCT ?author) as ?total)"),
        "number_of_bookshelves": self.all_bookshelves(
            select="SELECT (COUNT(DISTINCT ?bookshelf) as ?total)"
        ),
        "number_of_subjects": self.all_subjects(select="SELECT (COUNT(DISTINCT ?subject) as ?total)"),
    }
    return "".join(
        f"\n{label.replace('_', ' '):23}:\t {count:>5}"
        for label, count in counts.items()
    )
data
special
builder
get_metadata(self)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
self |
required |
Source code in dhtk/data_sources/gutenberg/data/builder.py
def get_metadata(self):
    """Download, unpack, process and serialize the Gutenberg.org metadata.

    Calls, in order: ``self.MANAGERS.SYSTEM.download_files`` on
    ``self.data_file``; ``self.MANAGERS.SYSTEM.unpack_archive``, whose
    returned graph replaces ``self.graph``; ``self._process_graph`` with the
    returned log; and ``self.MANAGERS.SYSTEM.serialize`` writing to
    ``self.master``.

    Args:
        self:

    Returns:
        None
    """
    self.MANAGERS.SYSTEM.download_files(self.data_file,
                                        configuration={"file": "downloads", "setting": "gutenberg"},
                                        desc="Downloading Gutenberg.org metadata")
    # unpack_archive hands back the (possibly new) graph and the captured log text.
    self.graph, log = self.MANAGERS.SYSTEM.unpack_archive(g=self.graph,
                                                          configuration={"file": "gutenberg", "setting": "books"},
                                                          desc="Unpacking Gutenberg.org metadata",
                                                          archive_type="r:bz2")
    self._process_graph(log)
    # The number of unpacked files is passed on as the n_files argument.
    self.MANAGERS.SYSTEM.serialize(self.graph, len(self.MANAGERS.SYSTEM.unpacked_files), file_name=self.master)
owl_test()
Trying to automatically extract class information from triples
check: https://gitlab.com/neves.ces/bnf_project/-/blob/d3d2d1b40d7c6243481c4bf97bc48184e358d905/Gutenberg_process.ipynb
Source code in dhtk/data_sources/gutenberg/data/builder.py
def owl_test():
    """Trying to automatically extract class information from triples.

    check:
    https://gitlab.com/neves.ces/bnf_project/-/blob/d3d2d1b40d7c6243481c4bf97bc48184e358d905/Gutenberg_process.ipynb

    Loads the ontology "master.rdf" with owlready2, declares a ``Drug`` class
    and a dynamically created ``test`` class inside it, then lists the
    ontology classes. Nothing is returned.

    Args:

    Returns:
        None
    """
    import owlready2
    onto = owlready2.get_ontology(base_iri="master.rdf")
    onto._load()
    # NOTE(review): ``get_ontology`` is called unqualified here, unlike the
    # ``owlready2.get_ontology`` call above, and its result is discarded.
    # Confirm the name is imported at module level; otherwise this raises NameError.
    get_ontology("master.rdf#")
    with onto:
        class Drug(owlready2.Thing):
            """Empty class declared inside the ``onto`` context."""
            pass

    # Class constructors
    def constructor(self, arg):
        """Store ``arg`` on the instance as ``constructor_arg``.

        Args:
            arg: Value to store.

        Returns:
            None
        """
        self.constructor_arg = arg

    def define_class(name, arg):
        """Create a subclass of ``owlready2.Thing`` named ``name`` inside ``onto``.

        Args:
            name: Name of the class to create.
            arg: Not used by the body.

        Returns:
            None
        """
        with onto:
            type(name, (owlready2.Thing,), {"__init__": constructor})

    define_class("test", 1)
    # The list of classes is built but not returned or stored.
    list(onto.classes())
serialize(self, graph, n_files, file_name='master.rdf')
Parameters:
Name | Type | Description | Default |
---|---|---|---|
graph |
required | ||
n_files |
required | ||
file_name |
(Default value = "master.rdf") |
'master.rdf' |
Source code in dhtk/data_sources/gutenberg/data/builder.py
def serialize(self, graph, n_files, file_name="master.rdf"):
    """Write ``graph`` as RDF/XML into the working directory and report it.

    Args:
        graph: Graph to save; it must support ``len()`` and
            ``serialize(destination=..., format=..., encoding=...)``.
        n_files: Number of source RDF graphs, used only in the printed message.
        file_name: Name of the output file inside ``self.wd``.
            (Default value = "master.rdf")

    Returns:
        None
    """
    print(f"{len(graph)} triples merged from {n_files} RDF graphs")
    target = self.wd / file_name
    graph.serialize(destination=str(target), format='xml', encoding="utf-8")
    print(f"Saved to {target}")
unpack_archive(self, g=<Graph identifier=N706aa1ef48b34bbaa288f4a3b27fad93 (<class 'rdflib.graph.Graph'>)>, configuration=None, desc='Unpacking archive', archive_type='r:bz2')
Parameters:
Name | Type | Description | Default |
---|---|---|---|
g |
rdflib.Graph |
The graph where unpack (Default value = rdflib.Graph()) |
<Graph identifier=N706aa1ef48b34bbaa288f4a3b27fad93 (<class 'rdflib.graph.Graph'>)> |
configuration |
str |
Configuration settings (Default value = None) |
None |
desc |
str |
The description of the work (Default value = "Unpacking archive") |
'Unpacking archive' |
archive_type |
str |
the suffix extension (Default value = "r:bz2") |
'r:bz2' |
Returns:
Type | Description |
---|---|
g (rdflib.Graph) |
The graph log ( logging): The log file |
Source code in dhtk/data_sources/gutenberg/data/builder.py
def unpack_archive(self, g=None, configuration=None, desc="Unpacking archive", archive_type="r:bz2"):
    """Extract every downloaded archive and merge its files into a graph.

    Each archive in ``self.last_download`` is opened, its members are
    extracted flat (folder structure removed) into ``self.wd`` and merged
    into ``g`` with ``self._merge_graphs``; the archive is then deleted.

    Args:
        g (rdflib.Graph): The graph to merge into. A new ``rdflib.Graph()``
            is created per call when omitted. (Default value = None)
        configuration (dict): Configuration record under which the unpacked
            file names are saved.
            (Default value = {"file": "downloads", "setting": "files"})
        desc (str): Progress-bar description. (Default value = "Unpacking archive")
        archive_type (str): Mode passed to ``tarfile.open``. (Default value = "r:bz2")

    Returns:
        g (rdflib.Graph): The merged graph.
        log (str): Text logged on the "rdflib.term" logger while unpacking.

    Raises:
        OSError: If an archive cannot be removed after extraction.
    """
    # A default evaluated in the signature would be one graph shared by every
    # call; build a fresh one per call instead.
    if g is None:
        g = rdflib.Graph()
    if configuration is None:
        configuration = {"file": "downloads", "setting": "files"}

    # Capture rdflib.graph.parse logs to identify broken URIs
    logger = logging.getLogger("rdflib.term")
    log_capture_string = io.StringIO()
    ch = logging.StreamHandler(log_capture_string)
    logger.addHandler(ch)

    file_names = []
    try:
        for archive in self.last_download:
            print("Opening {}".format(archive))
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(archive, archive_type) as tar:
                for member in tqdm.tqdm(tar.getmembers(), desc=desc):
                    member.name = os.path.basename(member.name)  # Cleans the folder structure
                    tar.extract(member, self.wd)
                    g = self._merge_graphs(self.wd / member.name, g)
                    file_names.append(member.name)
            print("Removing {} ".format(archive))
            # Portable replacement for spawning an external "rm" process.
            os.remove(archive)
        log = log_capture_string.getvalue()
    finally:
        # Detach the capture handler so repeated calls do not accumulate handlers.
        logger.removeHandler(ch)

    # Save to config
    self.MANAGERS.CONFIG.add_record(configuration, value=file_names, unique=True)
    self.unpacked_files = file_names
    return g, log
tools
special
author
Contains the GutenbergAuthor Class.
GutenbergAuthor (Author)
The class for an author extended with variables from the gutenberg project. Child of the generic Author class.
Example
from pprint import pprint
from dhtk.data_sources.gutenberg.author import GutenbergAuthor
# Create an author manually.
adam_smith_author = GutenbergAuthor(
gutenberg_id='http://www.gutenberg.org/2009/agents/1158',
name='Smith, Adam'
)
# create an author form the gutenberg repository
from dhtk.data_sources.gutenberg.data import GutenbergData
database = GutenbergData()
database.search_author_by_name("Adam", "Smith")
# [('Smith, Adam', 'http://www.gutenberg.org/2009/agents/1158'),
# ('Smith, George Adam', 'http://www.gutenberg.org/2009/agents/5016')]
adam_smith_author_pg = database.author_from_author_id(
'http://www.gutenberg.org/2009/agents/1158'
)
adam_smith_author_pg.print_info()
# Adam Smith
# Metadata :
# - gutenberg_id: http://www.gutenberg.org/2009/agents/1158
# - id : http://www.gutenberg.org/2009/agents/1158
# - gutenberg_name: Smith, Adam
# - gutenberg_aliases:
# - aliases :
# - web_pages :
# - http://en.wikipedia.org/wiki/Adam_Smith
# - birth_date : 1723
# - death_date : 1790
Source code in dhtk/data_sources/gutenberg/tools/author.py
class GutenbergAuthor(Author):
    """The class for an author extended with variables from the gutenberg project.

    Child of the generic Author class.

    Example
    --------
        from pprint import pprint
        from dhtk.data_sources.gutenberg.author import GutenbergAuthor

        # Create an author manually.
        adam_smith_author = GutenbergAuthor(
            gutenberg_id='http://www.gutenberg.org/2009/agents/1158',
            name='Smith, Adam'
        )

        # Create an author from the gutenberg repository
        from dhtk.data_sources.gutenberg.data import GutenbergData
        database = GutenbergData()
        database.search_author_by_name("Adam", "Smith")
        # [('Smith, Adam', 'http://www.gutenberg.org/2009/agents/1158'),
        #  ('Smith, George Adam', 'http://www.gutenberg.org/2009/agents/5016')]
        adam_smith_author_pg = database.author_from_author_id(
            'http://www.gutenberg.org/2009/agents/1158'
        )
        adam_smith_author_pg.print_info()
        # Adam Smith
        # Metadata    :
        #     - gutenberg_id: http://www.gutenberg.org/2009/agents/1158
        #     - id          : http://www.gutenberg.org/2009/agents/1158
        #     - gutenberg_name: Smith, Adam
        #     - gutenberg_aliases:
        #     - aliases     :
        #     - web_pages   :
        #         - http://en.wikipedia.org/wiki/Adam_Smith
        #     - birth_date  : 1723
        #     - death_date  : 1790
    """

    def __init__(self, gutenberg_id, name, aliases=None, web_pages=None, same_as=None, **kwargs):
        """Init function of GutenbergAuthor.

        Notes:
            Implement the Abstract Author class and extend it with the
            gutenberg id, the eventual alias(es) and the eventual web page(s)
            of the author. Converts name into a HumanName object with
            convert_name.

        Args:
            gutenberg_id (str): URI of the gutenberg author in the Gutenberg RDF.
            name (str): Name of the author.
            aliases (set(str)): Eventual aliases of the author; anything that
                is not a set is replaced by an empty set. (Default value = None)
            web_pages (set(str)): Eventual web pages of the author; anything
                that is not a set is replaced by an empty set. (Default value = None)
            same_as: Forwarded to ``Author.__init__``; ``None`` becomes ``{}``.
            **kwargs: Forwarded to ``Author.__init__``.

        Raises:
            ReferenceError: If ``gutenberg_id`` is not a Gutenberg agent URI.
        """
        if same_as is None:
            same_as = {}

        # Dots are escaped so that only the literal host name is accepted;
        # fullmatch already anchors both ends.
        id_format = re.compile(r"http://www\.gutenberg\.org/2009/agents/\d+")
        if not id_format.fullmatch(gutenberg_id):
            raise ReferenceError(f"This gutenberg id is not valid: {gutenberg_id}")

        # NOTE(review): self.metadata is written before super().__init__()
        # runs, so it presumably resolves to an attribute provided by Author.
        # Confirm it is not shared between instances.
        self.metadata["gutenberg_id"] = gutenberg_id
        self.metadata["id"] = gutenberg_id
        self.metadata["gutenberg_name"] = name

        if not isinstance(aliases, set):
            aliases = set()
        self.metadata["aliases"] = aliases

        if not isinstance(web_pages, set):
            web_pages = set()
        self.metadata["web_pages"] = web_pages

        # Add saint to nameparser's FIRST_NAME_TITLES
        FIRST_NAME_TITLES.add("saint")

        name = self.convert_name(name)
        self.metadata["aliases"] = {str(self.convert_name(alias)) for alias in aliases}
        # The converted main name must not also be listed as an alias.
        if str(name) in self.metadata["aliases"]:
            self.metadata["aliases"].remove(str(name))

        metadata = copy.copy(self.metadata)
        super().__init__(
            name,
            same_as=same_as,
            metadata=metadata,
            **kwargs
        )

    def convert_name(self, human_name):
        """Convert a name string into a HumanName object.

        A suffix is moved into ``metadata["gutenberg_name_suffix"]``. When the
        parsed name has a nickname that spells out an abbreviated first or
        middle name, the full form replaces the abbreviation and the name
        without nickname is recorded in ``metadata["aliases"]``.

        Args:
            human_name (str): Author's name in a string.

        Returns:
            HumanName: The parsed and adjusted name.

        Notes:
            Is done in the init for every GutenbergAuthor object with initial
            string parameter "name".
        """
        human_name = HumanName(human_name)
        if human_name.suffix:
            self.metadata["gutenberg_name_suffix"] = human_name.suffix
            human_name.suffix = ""
        if human_name.nickname:
            no_nickname = copy.copy(human_name)
            no_nickname.nickname = ""

            # Turn each initial such as "J." into the pattern "J\w+".
            # flags must be passed by keyword: the fourth positional
            # parameter of re.sub is count, not flags.
            first_name_match = re.match(
                re.sub(r"(([A-Z])[a-z]*[.])", r"\2\\w+", human_name.first, flags=re.UNICODE),
                human_name.nickname,
                re.UNICODE
            )
            if first_name_match and len(first_name_match.group(0)) >= len(human_name.first):
                human_name.first = first_name_match.group(0)
                human_name.nickname = human_name.nickname[len(human_name.first):].strip()
                self.metadata["aliases"] = {str(no_nickname), }

            middle_name_match = re.match(
                re.sub(r"(([A-Z])[a-z]*[.])", r"\2\\w+", human_name.middle, flags=re.UNICODE),
                human_name.nickname,
                re.UNICODE
            )
            if middle_name_match and len(middle_name_match.group(0)) >= len(human_name.middle):
                human_name.middle = middle_name_match.group(0)
                human_name.nickname = human_name.nickname[len(human_name.middle):].strip()
                self.metadata["aliases"].add(str(no_nickname))
        return human_name

    def get_gutenberg_id(self):
        """Get the gutenberg id url of the author.

        Returns:
            str: the gutenberg id
        """
        return self.metadata["gutenberg_id"]

    def __eq__(self, other):
        """Equality function between authors.

        Notes:
            Uses the gutenberg_id if other is an instance of GutenbergAuthor.
            If not, it defers to ``Author.__eq__()``, which the original
            documentation describes as comparing the author's names
            (first, last) and birthdate.

        Args:
            other (dhtk.common.author.Author): An instance of
                dhtk.common.author.Author or of its child-classes.

        Returns:
            bool: Whether the authors are the same or not.
        """
        if isinstance(other, GutenbergAuthor):
            equals = self.get_gutenberg_id() == other.get_gutenberg_id()
        else:
            equals = super().__eq__(other)
        return equals

    def __hash__(self):
        """Return hash for the author.

        Returns:
            int: Hash of the concatenated first name, last name and birth date.
        """
        return hash((self.get_first_name() + self.get_last_name() + self.get_birth_date()))

    def __repr__(self):
        """Return the string representing the object.

        Returns:
            str: ``<Author: last, first(gutenberg_id)>``
        """
        return f"<Author: {self.get_last_name()}, {self.get_first_name()}" \
               f"({self.get_gutenberg_id()})>"
__eq__(self, other)
special
Equality function between authors.
!!! notes Test the equality of the two authors. Using the gutenberg_id if other is an instance of GutenbergAuthor. If not, it uses the dhtk.common.author.Author.__eq__() method that uses the author's names (first, last) and its birthdate.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
other |
dhtk.common.author.Author |
An instance of dhtk.common.author.Author or of its child-classes. |
required |
Returns:
Type | Description |
---|---|
equality (bool) |
A bool that tells if the authors are the same or not. |
Source code in dhtk/data_sources/gutenberg/tools/author.py
def __eq__(self, other):
    """Tell whether two authors are the same.

    Notes:
        Two GutenbergAuthor objects are compared by their gutenberg id. Any
        other object is handed to the parent class' ``__eq__()``, which the
        original documentation describes as comparing the author's names
        (first, last) and birthdate.

    Args:
        other (dhtk.common.author.Author): An instance of
            dhtk.common.author.Author or of its child-classes.

    Returns:
        bool: Whether the authors are the same or not.
    """
    if not isinstance(other, GutenbergAuthor):
        return super().__eq__(other)
    return self.get_gutenberg_id() == other.get_gutenberg_id()
__hash__(self)
special
Return hash for the author.
Returns:
Type | Description |
---|---|
hash (int) |
The hash value for the author. |
Source code in dhtk/data_sources/gutenberg/tools/author.py
def __hash__(self):
    """Return the hash value for the author.

    Returns:
        int: Hash of the concatenated first name, last name and birth date.
    """
    identity = self.get_first_name() + self.get_last_name() + self.get_birth_date()
    return hash(identity)
__init__(self, gutenberg_id, name, aliases=None, web_pages=None, same_as=None, **kwargs)
special
Init function of GutenbergAuthor.
!!! notes Implement the Abstract Author class and extend it with the gutenberg id, the eventual alias(es) and the eventual web page(s) of the author. Converts name into a HumanName object with convert_name.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
gutenberg_id |
str) |
URI of the gutenberg author in the Gutenberg RDF. |
required |
name |
str) |
Name of the author. |
required |
aliases |
set(str |
Eventual aliases of the author.default None |
None |
web_pages |
set(str |
Eventual web pages of the author. default None |
None |
Source code in dhtk/data_sources/gutenberg/tools/author.py
def __init__(self, gutenberg_id, name, aliases=None, web_pages=None, same_as=None, **kwargs):
    """Init function of GutenbergAuthor.

    Notes:
        Implement the Abstract Author class and extend it with the gutenberg
        id, the eventual alias(es) and the eventual web page(s) of the
        author. Converts name into a HumanName object with convert_name.

    Args:
        gutenberg_id (str): URI of the gutenberg author in the Gutenberg RDF.
        name (str): Name of the author.
        aliases (set(str)): Eventual aliases of the author; anything that is
            not a set is replaced by an empty set. (Default value = None)
        web_pages (set(str)): Eventual web pages of the author; anything that
            is not a set is replaced by an empty set. (Default value = None)
        same_as: Forwarded to the parent ``__init__``; ``None`` becomes ``{}``.
        **kwargs: Forwarded to the parent ``__init__``.

    Raises:
        ReferenceError: If ``gutenberg_id`` is not a Gutenberg agent URI.
    """
    if same_as is None:
        same_as = {}

    # Dots are escaped so that only the literal host name is accepted;
    # fullmatch already anchors both ends.
    id_format = re.compile(r"http://www\.gutenberg\.org/2009/agents/\d+")
    if not id_format.fullmatch(gutenberg_id):
        raise ReferenceError(f"This gutenberg id is not valid: {gutenberg_id}")

    # NOTE(review): self.metadata is written before super().__init__() runs,
    # so it presumably resolves to an attribute provided by the parent class.
    # Confirm it is not shared between instances.
    self.metadata["gutenberg_id"] = gutenberg_id
    self.metadata["id"] = gutenberg_id
    self.metadata["gutenberg_name"] = name

    if not isinstance(aliases, set):
        aliases = set()
    self.metadata["aliases"] = aliases

    if not isinstance(web_pages, set):
        web_pages = set()
    self.metadata["web_pages"] = web_pages

    # Add saint to nameparser's FIRST_NAME_TITLES
    FIRST_NAME_TITLES.add("saint")

    name = self.convert_name(name)
    self.metadata["aliases"] = {str(self.convert_name(alias)) for alias in aliases}
    # The converted main name must not also be listed as an alias.
    if str(name) in self.metadata["aliases"]:
        self.metadata["aliases"].remove(str(name))

    metadata = copy.copy(self.metadata)
    super().__init__(
        name,
        same_as=same_as,
        metadata=metadata,
        **kwargs
    )
__repr__(self)
special
Returns:
Type | Description |
---|---|
object_str (str) |
String representing the object |
Source code in dhtk/data_sources/gutenberg/tools/author.py
def __repr__(self):
    """Return the string representing the object.

    Returns:
        str: ``<Author: last, first(gutenberg_id)>``
    """
    # The closing ">" belongs after the parenthesised id, not inside it.
    return f"<Author: {self.get_last_name()}, {self.get_first_name()}" \
           f"({self.get_gutenberg_id()})>"
convert_name(self, human_name)
Convert human_name string containing into a HumanName object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
human_name(str) |
Author's name in a string. |
required |
Returns:
!!! notes Is done in the init for every GutenbergAuthor object with initial string parameter "name".
Source code in dhtk/data_sources/gutenberg/tools/author.py
def convert_name(self, human_name):
    """Convert a name string into a HumanName object.

    A suffix is moved into ``metadata["gutenberg_name_suffix"]``. When the
    parsed name has a nickname that spells out an abbreviated first or middle
    name, the full form replaces the abbreviation and the name without
    nickname is recorded in ``metadata["aliases"]``.

    Args:
        human_name (str): Author's name in a string.

    Returns:
        HumanName: The parsed and adjusted name.

    Notes:
        Is done in the init for every GutenbergAuthor object with initial
        string parameter "name".
    """
    human_name = HumanName(human_name)
    if human_name.suffix:
        self.metadata["gutenberg_name_suffix"] = human_name.suffix
        human_name.suffix = ""
    if human_name.nickname:
        no_nickname = copy.copy(human_name)
        no_nickname.nickname = ""

        # Turn each initial such as "J." into the pattern "J\w+".
        # flags must be passed by keyword: the fourth positional parameter
        # of re.sub is count, not flags.
        first_name_match = re.match(
            re.sub(r"(([A-Z])[a-z]*[.])", r"\2\\w+", human_name.first, flags=re.UNICODE),
            human_name.nickname,
            re.UNICODE
        )
        if first_name_match and len(first_name_match.group(0)) >= len(human_name.first):
            human_name.first = first_name_match.group(0)
            human_name.nickname = human_name.nickname[len(human_name.first):].strip()
            self.metadata["aliases"] = {str(no_nickname), }

        middle_name_match = re.match(
            re.sub(r"(([A-Z])[a-z]*[.])", r"\2\\w+", human_name.middle, flags=re.UNICODE),
            human_name.nickname,
            re.UNICODE
        )
        if middle_name_match and len(middle_name_match.group(0)) >= len(human_name.middle):
            human_name.middle = middle_name_match.group(0)
            human_name.nickname = human_name.nickname[len(human_name.middle):].strip()
            self.metadata["aliases"].add(str(no_nickname))
    return human_name
get_gutenberg_id(self)
Get the gutenberg id url of the author.
Returns:
Type | Description |
---|---|
str |
the gutenberg id |
Source code in dhtk/data_sources/gutenberg/tools/author.py
def get_gutenberg_id(self):
    """Return the Gutenberg id URL stored in the author's metadata.

    Returns:
        str: the gutenberg id

    Raises:
        KeyError: If the metadata has no "gutenberg_id" entry.
    """
    gutenberg_id = self.metadata["gutenberg_id"]
    return gutenberg_id
book
Contains the GutenbergBook Class.
Returns a Book object which is extended to contain parameters for gutenberg id
GutenbergBook (Book)
Extends the Book class for Project Gutenberg books.
Examples:
>>> from pprint import pprint
>>> from dhtk.data_sources.gutenberg.tools.book import GutenbergBook
>>> from dhtk.data_sources.gutenberg.tools.author import GutenbergAuthor
>>> moby_dick = GutenbergBook(
>>> title="Moby Dick",
>>> gutenberg_id="http://www.gutenberg.org/ebooks/2489",
>>> author=GutenbergAuthor(
>>> gutenberg_id='http://www.gutenberg.org/2009/agents/9',
>>> name='Melville, Herman'
>>> )
>>> )
>>> from dhtk.data_sources.gutenberg.api.data import GutenbergData
>>> gutenberg_search = GutenbergData()
>>> book = gutenberg_search.book_from_book_id("http://www.gutenberg.org/ebooks/2701")
>>> book.print_info()
# Title : Moby Dick; Or, The Whale
# Author : Herman Melville
# Metadata :
# - gutenberg_id: http://www.gutenberg.org/ebooks/2701
Source code in dhtk/data_sources/gutenberg/tools/book.py
class GutenbergBook(Book):
    """Extends the Book class for Project Gutenberg books.

    Examples:
        >>> from pprint import pprint
        >>> from dhtk.data_sources.gutenberg.tools.book import GutenbergBook
        >>> from dhtk.data_sources.gutenberg.tools.author import GutenbergAuthor
        >>> moby_dick = GutenbergBook(
        ...     title="Moby Dick",
        ...     gutenberg_id="http://www.gutenberg.org/ebooks/2489",
        ...     author=GutenbergAuthor(
        ...         gutenberg_id='http://www.gutenberg.org/2009/agents/9',
        ...         name='Melville, Herman'
        ...     )
        ... )
        >>> from dhtk.data_sources.gutenberg.api.data import GutenbergData
        >>> gutenberg_search = GutenbergData()
        >>> book = gutenberg_search.book_from_book_id("http://www.gutenberg.org/ebooks/2701")
        >>> book.print_info()
        # Title     : Moby Dick; Or, The Whale
        # Author    : Herman Melville
        # Metadata  :
        #     - gutenberg_id: http://www.gutenberg.org/ebooks/2701
    """

    def __init__(self, gutenberg_id, title, author, same_as=None, **kwargs):
        """
        Init function of the GutenbergBook Class.

        Args:
            gutenberg_id (str) : Must be "http://www.gutenberg.org/ebooks/" followed by digits only.
            author (dhtk.common.author.Author) :The object containing the author of the book. Of type dhtk.common.author.Author or a subclass of it.
            title (str) :The title of the book, in format given by Gutenberg.
            same_as (dict): A dictionary containing same_as URIs.
            **kwargs (dict) : Will be used as metadata.

        Raises:
            ReferenceError: If gutenberg_id does not have the expected format.
        """
        if same_as is None:
            same_as = {}
        # The dots are escaped so that only the literal host name is accepted;
        # fullmatch() already anchors the pattern at both ends.
        id_format = re.compile(r"http://www\.gutenberg\.org/ebooks/\d+")
        if not id_format.fullmatch(gutenberg_id):
            raise ReferenceError("This gutenberg id is not valid! %s" % gutenberg_id)
        # Collapse every run of whitespace (including line breaks) to one space.
        title = re.sub(r"\s+", " ", title)
        super().__init__(title=title, author=author, gutenberg_id=gutenberg_id, same_as=same_as, metadata=kwargs)

    def get_book_id(self):
        """
        Returns:
            str: The "gutenberg_id" metadata entry, or "" if it is missing.
        """
        return self.metadata.get("gutenberg_id", "")

    def get_uri(self):
        """
        Returns:
            str: The "gutenberg_id" metadata entry, or "" if it is missing.
        """
        return self.metadata.get("gutenberg_id", "")

    def get_book_id_number(self):
        """
        Returns:
            str: The part of the gutenberg id after its last "/" ("" if the id is missing).
        """
        return self.metadata.get("gutenberg_id", "/").rsplit("/", 1)[1]

    def get_text_file_dir_path(self):
        """Return the suffix of the uri of the book in a gutenberg text repository.

        Returns:
            str: The suffix of the gutenberg file repository where the file is to be found.

        Notes
        -----
        This method is generally used with::

            "file://gutenberg/repository/path/" + self.get_text_file_dir_path() + "-file.extension"
            #or
            "http://distant.gutenberg-repository.path" + self.get_text_file_dir_path() + "-file.extension"

        the "-file.extension" can be -0.txt, .zip, .txt depending on the presence in the repository
        and on the file encoding.

        Example
        -------
            print(book.get_text_file_dir_path())
            # "2/7/0/2701/2701"
        """
        id_text = self.get_book_id_number()
        id_value = int(id_text)
        # Single-digit ids are stored below the directory "0".
        if id_value < 10:
            return "0/{0}/{0}".format(id_text)
        # Otherwise the leading digits of the id become nested directories.
        if id_value < 100:
            leading = [id_text[0]]
        elif id_value < 1000:
            leading = [id_text[0], id_text[1]]
        else:
            leading = list(str(id_text).zfill(2)[:-1])
        return "/".join(leading + [id_text, id_text])

    def repository(self):
        """Return a GutenbergTexts object built for this book."""
        repo = GutenbergTexts(self)
        return repo

    def original_text(self):
        """Return the result of get_original_text() on a new repository object."""
        text = self.repository().get_original_text()
        return text

    def __eq__(self, other):
        """
        Equality function.

        Notes
        -----
        Test the equality of the two books. Using the gutenberg_id if other is an instance of
        GutenbergBook. If not, it uses the dhtk.common.author.Book.__eq__() method that uses the
        book's authors and titles.

        Parameters
        ----------
        other: an instance from dhtk.data_sources.templates.Book or any child class.

        Returns
        -------
        equality: bool
        """
        if isinstance(other, GutenbergBook):
            equals = self.get_book_id() == other.get_book_id()
        else:
            equals = super().__eq__(other)
        return equals

    def __hash__(self):
        """
        Returns hash of attributes of gutenberg book.

        Notes
        -----
        The hash is created from:
        - author
        - title
        - first date (from metadata)
        Allows dictionary keys to be compared quickly.

        Returns
        -------
        hash: int
        """
        # NOTE(review): __eq__ compares GutenbergBooks by gutenberg_id while this
        # hash uses author/title/date; confirm equal books always hash equally.
        return hash(self._author.get_full_name() + self._title + self.get_first_edition_date())

    def __repr__(self):
        """
        Returns
        -------
        object_str : String representing the object
        """
        return "<GutenbergBook: %s - %s gutenberg_id: %s>" % (
            self.get_author().get_full_name(),
            self.get_title(),
            self.get_book_id_number()
        )
__eq__(self, other)
special
Equality function.
Notes
Tests the equality of the two books, using the gutenberg_id if other is an instance of GutenbergBook. If not, it uses the dhtk.common.author.Book.__eq__() method, which uses the books' authors and titles.
Parameters
other: an instance from dhtk.data_sources.templates.Book or any child class.
Returns
equality: bool
Source code in dhtk/data_sources/gutenberg/tools/book.py
def __eq__(self, other):
    """
    Compare this book with another one.

    Notes
    -----
    Two GutenbergBook instances are compared through their gutenberg ids. For any other
    object the comparison is delegated to the parent class' __eq__() method.

    Parameters
    ----------
    other: an instance from dhtk.data_sources.templates.Book or any child class.

    Returns
    -------
    equality: bool
    """
    if not isinstance(other, GutenbergBook):
        return super().__eq__(other)
    return self.get_book_id() == other.get_book_id()
__hash__(self)
special
Returns hash of attributes of gutenberg book.
Notes
The hash is created from the author, the title and the first date (from metadata). It allows dictionary keys to be compared quickly.
Returns
hash: int
Source code in dhtk/data_sources/gutenberg/tools/book.py
def __hash__(self):
    """
    Hash the book from its author, title and first edition date.

    Notes
    -----
    The hashed value is the concatenation of:
    - the author's full name
    - the title
    - the first edition date
    Allows dictionary keys to be compared quickly.

    Returns
    -------
    hash: int
    """
    author_name = self._author.get_full_name()
    title = self._title
    first_edition = self.get_first_edition_date()
    return hash(author_name + title + first_edition)
__init__(self, gutenberg_id, title, author, same_as=None, **kwargs)
special
Init function of the GutenbergBook Class.
gutenberg_id (str) : Must start with "http://www.gutenberg.org/ebooks/".
author (dhtk.common.author.Author) :The object containing the author of the book. Of type dhtk.common.author.Author or a subclass of it.
title (str) :The title of the book, in format given by Gutenberg.
same_as (dict): A dictionary containing same_as URIs.
**kwargs (dict) : Will be used as metadata.
Source code in dhtk/data_sources/gutenberg/tools/book.py
def __init__(self, gutenberg_id, title, author, same_as=None, **kwargs):
    """
    Init function of the GutenbergBook Class.

    Args:
        gutenberg_id (str) : Must be "http://www.gutenberg.org/ebooks/" followed by digits only.
        author (dhtk.common.author.Author) :The object containing the author of the book. Of type dhtk.common.author.Author or a subclass of it.
        title (str) :The title of the book, in format given by Gutenberg.
        same_as (dict): A dictionary containing same_as URIs.
        **kwargs (dict) : Will be used as metadata.

    Raises:
        ReferenceError: If gutenberg_id does not have the expected format.
    """
    if same_as is None:
        same_as = {}
    # The dots are escaped so that only the literal host name is accepted;
    # fullmatch() already anchors the pattern at both ends.
    id_format = re.compile(r"http://www\.gutenberg\.org/ebooks/\d+")
    if not id_format.fullmatch(gutenberg_id):
        raise ReferenceError("This gutenberg id is not valid! %s" % gutenberg_id)
    # Collapse every run of whitespace (including line breaks) to one space.
    title = re.sub(r"\s+", " ", title)
    super().__init__(title=title, author=author, gutenberg_id=gutenberg_id, same_as=same_as, metadata=kwargs)
__repr__(self)
special
Returns
object_str : String representing the object
Source code in dhtk/data_sources/gutenberg/tools/book.py
def __repr__(self):
    """
    Build the debug representation of the book.

    Returns
    -------
    object_str : String showing the author's full name, the title and the id number
    """
    fields = (
        self.get_author().get_full_name(),
        self.get_title(),
        self.get_book_id_number(),
    )
    return "<GutenbergBook: %s - %s gutenberg_id: %s>" % fields
get_book_id(self)
Source code in dhtk/data_sources/gutenberg/tools/book.py
def get_book_id(self):
    """
    Returns:
        str: The "gutenberg_id" metadata entry, or "" if it is missing.
    """
    gutenberg_id = self.metadata.get("gutenberg_id", "")
    return gutenberg_id
get_book_id_number(self)
Source code in dhtk/data_sources/gutenberg/tools/book.py
def get_book_id_number(self):
    """
    Returns:
        str: The part of the gutenberg id after its last "/" ("" if the id is missing).
    """
    uri = self.metadata.get("gutenberg_id", "/")
    parts = uri.rsplit("/", 1)
    return parts[1]
get_text_file_dir_path(self)
Return the suffix of the uri of the book in a gutenberg text repository.
Returns:
Type | Description |
---|---|
str |
Returns the suffix of the gutenberg file repository where the file is to be found: |
Notes
This method is generally used with::
"file://gutenberg/repository/path/" + self.get_text_file_dir_path() + "-file.extension"
#or
"http://distant.gutenberg-repository.path" + self.get_text_file_dir_path() + "-file.extension"
the "-file.extension" can be -0.txt, .zip, .txt depending on the presence in the repository and on the file encoding.
Example
print(book.get_text_file_dir_path())
# "2/7/0/2701/2701"
Source code in dhtk/data_sources/gutenberg/tools/book.py
def get_text_file_dir_path(self):
    """Return the suffix of the uri of the book in a gutenberg text repository.

    Returns:
        str: The suffix of the gutenberg file repository where the file is to be found.

    Notes
    -----
    This method is generally used with::

        "file://gutenberg/repository/path/" + self.get_text_file_dir_path() + "-file.extension"
        #or
        "http://distant.gutenberg-repository.path" + self.get_text_file_dir_path() + "-file.extension"

    the "-file.extension" can be -0.txt, .zip, .txt depending on the presence in the repository
    and on the file encoding.

    Example
    -------
        print(book.get_text_file_dir_path())
        # "2/7/0/2701/2701"
    """
    id_text = self.get_book_id_number()
    id_value = int(id_text)
    # Single-digit ids are stored below the directory "0".
    if id_value < 10:
        return "0/{0}/{0}".format(id_text)
    # Otherwise the leading digits of the id become nested directories.
    if id_value < 100:
        leading = [id_text[0]]
    elif id_value < 1000:
        leading = [id_text[0], id_text[1]]
    else:
        leading = list(str(id_text).zfill(2)[:-1])
    return "/".join(leading + [id_text, id_text])
get_uri(self)
Source code in dhtk/data_sources/gutenberg/tools/book.py
def get_uri(self):
    """
    Returns:
        str: The "gutenberg_id" metadata entry, or "" if it is missing.
    """
    uri = self.metadata.get("gutenberg_id", "")
    return uri
texts
Contains GutenbergTexts Class.
Notes
This class is being reworked. The cleaning of the texts was adapted from: https://github.com/okfn/gutenizer
GutenbergTexts
Clean up Gutenberg texts by removing all the header and footer boilerplate.
Notes
Parts of this class have to be reworked.
Usage: init and then run _extract_text. Attributes: _notes_end = "", _header_end = "", _footer_start = "", _original_text = "", _clean_text = "", _url = "".
Source code in dhtk/data_sources/gutenberg/tools/texts.py
class GutenbergTexts:
    """
    Clean up Gutenberg texts by removing all the header and footer bumpf.

    Notes
    -----
    Parts of this class have to be reworked.
    Usage : init and then run _extract_text.
        _notes_end = ""
        _header_end = ""
        _footer_start = ""
        _original_text = ""
        _clean_text = ""
        _url = ""
    """

    def __init__(self, book, repository_uri='http://aleph.gutenberg.org'):
        """
        Init function of the GutenbergTexts.

        Check repository_uri and create a temporary directory for file operations.
        repository_uri: can be local file:/path/to/dir
        refer to:
        https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages
        to download the files.

        Parameters
        ------------------
        book :
            The book whose text is handled; stored as ``self.book``.
        repository_uri : str
            Can be a file uri file://home/user/Documents/gutenberg_dump or
            a http uri: http://aleph.gutenberg.org

        Raises
        ------
        ValueError
            If repository_uri is empty or points to http://www.gutenberg.org/files.
        """
        self._original_text = None
        if not repository_uri:
            raise ValueError("Please set the URI of a 'local' gutenberg text repository.")
        if "http://www.gutenberg.org/files" in repository_uri:
            raise ValueError(
                """
Please create a local repository. More information on:
https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages
"""
            )
        self._temporary_dir = Path(mkdtemp(prefix="dhtk-"))
        self._repository_uri = repository_uri
        self.book = book

    def get_original_text(self):
        """Returns original text of a given book.

        The text is cached in ``self._original_text`` after the first successful call.

        Returns:
            The text of the book, or None if no text file was found for it.
        """
        if self._original_text:
            return self._original_text

        base_url = self._repository_uri + "/" + self.book.get_text_file_dir_path()
        valid_extensions = ("-0.txt", "-8.txt", ".txt")
        if self._repository_uri.startswith("file://"):
            valid_extensions = ("-0.txt", "-8.txt", ".txt", "-0.zip", "-8.zip", ".zip")

        found_url = False
        url = ""
        for extension in valid_extensions:
            url = base_url + extension
            try:
                found_url = url_exists(url)
            except Exception:  # aleph is not reliable, just use gutenberg directly for now
                book_number = self.book.get_book_id_number()
                # Plain string replacement: the URI and directory path are
                # literal text, not regular expressions.
                url = url.replace(self._repository_uri, "http://www.gutenberg.org/files")
                url = url.replace(
                    self.book.get_text_file_dir_path(),
                    f"{book_number}/{book_number}"
                )
                found_url = url_exists(url)
            if found_url:
                break

        # TODO: once search does not find audio editions anymore, raise a
        #  Warning("Could not find the text file for ...") here instead of
        #  returning None.
        if not found_url:
            return None

        raw_file_path = str(
            download_files(url, self._temporary_dir / self.book.get_text_file_name(), self.book._title)
        )
        # The download is stored under the book title, so the archive type is
        # also taken from the URL.
        if url.endswith(".zip") or raw_file_path.endswith(".zip"):
            self._original_text = unarchive_book(raw_file_path)
        else:
            with open(raw_file_path, "r", encoding="utf8", errors='ignore') as book_text_file:
                self._original_text = book_text_file.read()
        # The downloaded file is only needed until its text is in memory.
        pathlib.Path(raw_file_path).unlink()
        return self._original_text

    def save_original_text_file_to(self, destination):
        """Save the original text to a text-file in or at destination.

        Args:
            destination(str): Path of the destination where the text will be saved.

        Returns:
            pathlib.Path: Path of the text file, or None if no text was found for the book.
        """
        destination = pathlib.Path(destination)
        filename = destination / self.book.get_text_file_name()
        # A non-empty file was already saved; only missing or empty files are written.
        if filename.is_file() and filename.stat().st_size > 0:
            return filename

        text = self.get_original_text()
        if text is None:
            return None

        destination.mkdir(parents=True, exist_ok=True)
        try:
            with open(filename, "w", encoding='utf8') as file_writer:
                file_writer.write(text)
        except IOError:
            # LOGGER.warning("File %s could not be created.", filename)
            print("File %s could not be created." % filename)
        return filename

    # def save_clean_text_file_to(self, destination):
    #     """Save the clean text to a text-file in or at destination.
    #
    #     Args:
    #       destination(str): Path of the destination where the text will be saved.
    #
    #     Returns:
    #
    #     """
    #     self.get_original_text()
    #
    #     destination = Path(destination)
    #
    #     if not destination.is_dir():
    #         destination.mkdir(parents=True, exist_ok=True)
    #
    #     filename = self.book.get_text_file_name()
    #
    #     filename = destination / filename
    #     if not filename.is_file() or filename.stat().st_size == 0:
    #         with open(filename, "w") as file_writer:
    #             file_writer.write(self._clean_text)
    #
    #     return filename

    def __del__(self):
        """Remove the temporary directory when the instance is garbage collected."""
        self.close()

    def close(self):
        """Remove temporary directory if instance is closed."""
        try:
            if self._temporary_dir.is_dir():
                shutil.rmtree(self._temporary_dir)
        except (AttributeError, NameError):
            # _temporary_dir does not exist when __init__ raised before creating it.
            pass
__init__(self, book, repository_uri='http://aleph.gutenberg.org')
special
Init function of the GutenbergTexts.
Check repository_uri and create a temporary directory for file operations. repository_uri: can be local file:/path/to/dir refer to: https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages to download the files.
Parameters
repository_uri : str Can be a file uri file://home/user/Documents/gutenberg_dump or a http uri: http://aleph.gutenberg.org
Source code in dhtk/data_sources/gutenberg/tools/texts.py
def __init__(self, book, repository_uri='http://aleph.gutenberg.org'):
    """
    Set up a GutenbergTexts object for one book.

    The repository URI is validated and a temporary directory (prefix "dhtk-")
    is created for file operations. For information on how to obtain a local
    copy of the files, refer to:
    https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages

    Parameters
    ------------------
    book :
        The book whose text is handled; stored as ``self.book``.
    repository_uri : str
        Can be a file uri file://home/user/Documents/gutenberg_dump or
        a http uri: http://aleph.gutenberg.org

    Raises
    ------
    ValueError
        If repository_uri is empty or points to http://www.gutenberg.org/files.
    """
    forbidden_repository = "http://www.gutenberg.org/files"
    self._original_text = None

    if not repository_uri:
        raise ValueError("Please set the URI of a 'local' gutenberg text repository.")
    if forbidden_repository in repository_uri:
        message = """
Please create a local repository. More information on:
https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages
"""
        raise ValueError(message)

    temporary_dir = mkdtemp(prefix="dhtk-")
    self._temporary_dir = Path(temporary_dir)
    self._repository_uri = repository_uri
    self.book = book
close(self)
Remove temporary directory if instance is closed.
Source code in dhtk/data_sources/gutenberg/tools/texts.py
def close(self):
    """Remove temporary directory if instance is closed.

    Does nothing when the temporary directory was never created or is
    already gone.
    """
    try:
        if self._temporary_dir.is_dir():
            shutil.rmtree(self._temporary_dir)
    except (AttributeError, NameError):
        # _temporary_dir does not exist when __init__ raised before creating it.
        pass
get_original_text(self)
Returns original text of a given book.
Source code in dhtk/data_sources/gutenberg/tools/texts.py
def get_original_text(self):
    """Returns original text of a given book.

    The text is cached in ``self._original_text`` after the first successful call.

    Returns:
        The text of the book, or None if no text file was found for it.
    """
    if self._original_text:
        return self._original_text

    base_url = self._repository_uri + "/" + self.book.get_text_file_dir_path()
    valid_extensions = ("-0.txt", "-8.txt", ".txt")
    if self._repository_uri.startswith("file://"):
        valid_extensions = ("-0.txt", "-8.txt", ".txt", "-0.zip", "-8.zip", ".zip")

    found_url = False
    url = ""
    for extension in valid_extensions:
        url = base_url + extension
        try:
            found_url = url_exists(url)
        except Exception:  # aleph is not reliable, just use gutenberg directly for now
            book_number = self.book.get_book_id_number()
            # Plain string replacement: the URI and directory path are
            # literal text, not regular expressions.
            url = url.replace(self._repository_uri, "http://www.gutenberg.org/files")
            url = url.replace(
                self.book.get_text_file_dir_path(),
                f"{book_number}/{book_number}"
            )
            found_url = url_exists(url)
        if found_url:
            break

    # TODO: once search does not find audio editions anymore, raise a
    #  Warning("Could not find the text file for ...") here instead of
    #  returning None.
    if not found_url:
        return None

    raw_file_path = str(
        download_files(url, self._temporary_dir / self.book.get_text_file_name(), self.book._title)
    )
    # The download is stored under the book title, so the archive type is
    # also taken from the URL.
    if url.endswith(".zip") or raw_file_path.endswith(".zip"):
        self._original_text = unarchive_book(raw_file_path)
    else:
        with open(raw_file_path, "r", encoding="utf8", errors='ignore') as book_text_file:
            self._original_text = book_text_file.read()
    # The downloaded file is only needed until its text is in memory.
    pathlib.Path(raw_file_path).unlink()
    return self._original_text
save_original_text_file_to(self, destination)
Save the original text to a text-file in or at destination.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
destination(str) |
Path of the destination where the text will be saved. |
required |
Source code in dhtk/data_sources/gutenberg/tools/texts.py
def save_original_text_file_to(self, destination):
    """Save the original text to a text-file in or at destination.

    Args:
        destination(str): Path of the destination where the text will be saved.

    Returns:
        pathlib.Path: Path of the text file, or None if no text was found for the book.
    """
    destination = pathlib.Path(destination)
    filename = destination / self.book.get_text_file_name()
    # A non-empty file was already saved; only missing or empty files are written.
    if filename.is_file() and filename.stat().st_size > 0:
        return filename

    text = self.get_original_text()
    if text is None:
        return None

    destination.mkdir(parents=True, exist_ok=True)
    try:
        with open(filename, "w", encoding='utf8') as file_writer:
            file_writer.write(text)
    except IOError:
        # LOGGER.warning("File %s could not be created.", filename)
        print("File %s could not be created." % filename)
    return filename
unarchive_book(path, destination=None)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path(str) |
Path of the archive of a book. A Zip file containing a single txt file. |
required | |
destination(str, |
optional |
Path where the text file should be extracted. (Default value = None) |
required |
Source code in dhtk/data_sources/gutenberg/tools/texts.py
def unarchive_book(path, destination=None):
    """Read the text of a zipped book and optionally write it to a file.

    Args:
        path(str): Path of the archive of a book. A Zip file containing a single txt file.
        destination(str, optional): Path where the text file should be extracted. (Default value = None)

    Returns:
        str: Decoded content of the first ".txt" member of the archive, or ""
            if the archive has no such member.

    Raises:
        IOError: If destination cannot be written.
    """
    raw_bytes = None
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(path, 'r') as archive:
        for member_name in archive.namelist():
            if member_name.endswith(".txt"):
                raw_bytes = archive.read(member_name)
                break
    if raw_bytes is None:
        return ""

    # The detected encoding may be None; fall back to UTF-8 in that case.
    encoding = chardet.detect(raw_bytes)["encoding"] or "utf-8"
    raw_text = raw_bytes.decode(encoding, errors="replace")

    if destination:
        try:
            with open(destination, "w", encoding="utf8") as out_file:
                out_file.write(raw_text)
        except IOError as error:
            raise IOError("%s could not be written." % destination) from error
    return raw_text