commit 93175f2c9c
4 changed files with 286 additions and 0 deletions
@@ -0,0 +1,28 @@
An archive.org downloader using Python and asyncio for maximum performance.

Installation
------------

archiveorg-dl requires Python >= 3.6:

```bash
pip install git+https://dev.funkwhale.audio/funkwhale/archiveorg-dl.git
```

Usage
-----

```bash
# print help
archiveorg-dl --help

# download all original mp3 and ogg files from the freemusicarchive collection to /destination
archiveorg-dl download "collection:freemusicarchive AND mediatype:audio" /destination --original -e mp3 -e ogg
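
# optionally tune concurrency: --parallel/-p caps simultaneous item downloads
# (default 10) and --page-size/-s sets search results fetched per page (default 100);
# the values below are purely illustrative
archiveorg-dl download "collection:freemusicarchive AND mediatype:audio" /destination -p 5 -s 50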

```

Download queries
----------------

The `download` command query argument is passed directly to archive.org. Refer
to [the corresponding documentation to craft your own queries](https://archive.org/advancedsearch.php).
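
Since archive.org accepts Lucene-style expressions, you can combine other
fields and ranges too. The queries below are only illustrative sketches: the
creator value is made up, and the field names are assumed to match archive.org's
advanced search schema.

```bash
# all original audio by a specific (hypothetical) artist
archiveorg-dl download 'creator:"Some Artist" AND mediatype:audio' /destination --original

# collection items restricted to a date range (Lucene range syntax)
archiveorg-dl download "collection:freemusicarchive AND date:[2015-01-01 TO 2015-12-31]" /destination
```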
@@ -0,0 +1,206 @@
import aiofiles
import aiohttp
import asyncio
import click
import click_log
import hashlib
import logging
import math
import os
import random
import urllib.parse

logger = logging.getLogger(__name__)
click_log.basic_config(logger)


@click.group()
def cli():
    pass


@cli.command()
@click.argument("query")
@click.argument(
    "download_dir",
    type=click.Path(
        dir_okay=True, exists=True, file_okay=False, writable=True, readable=True
    ),
)
@click.option(
    "--original/--no-original", default=True, help="Only download original files"
)
@click.option("--page-size", "-s", default=100, help="Number of search results per page")
@click.option(
    "--parallel", "-p", default=10, help="Number of parallel downloads allowed"
)
@click.option("--extension", "-e", multiple=True, help="Restrict extensions to download")
@click_log.simple_verbosity_option(logger)
def download(query, download_dir, original, page_size, parallel, extension):
    """
    Download all files matching given query and formats from Archive.org.

    Example query: "collection:freemusicarchive AND mediatype:audio"
    """
    logger.info(f"Starting download for query '{query}'")
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        handle_download(
            query=query,
            download_dir=download_dir,
            parallel=parallel,
            original=original,
            page_size=page_size,
            allowed_extensions=extension,
        )
    )


async def handle_download(
    query, download_dir, original, page_size, parallel, allowed_extensions
):
    # cap the number of items processed concurrently (see --parallel)
    sem = asyncio.Semaphore(parallel)
    conn = aiohttp.TCPConnector()
    async with aiohttp.ClientSession(connector=conn, raise_for_status=True) as session:
        # fetch a single result first, only to learn how many items match the query
        url = get_search_url(query, page_size=1, page=1)
        async with session.get(url) as response:
            data = await response.json()
        total = data["response"]["numFound"]
        logger.info(f"Downloading {total} items in {download_dir}...")
        page = 1
        total_pages = math.ceil(total / page_size)
        results = {
            "total": total,
            "handled": 0,
            "ok": 0,
            "skip": 0,
            "error": 0,
            "size": 0,
        }
        with click.progressbar(
            length=total, label="Downloading items", show_pos=True, show_percent=True
        ) as bar:
            while page <= total_pages:
                url = get_search_url(query, page_size=page_size, page=page)
                logger.debug(f"Fetching {url}...")
                async with session.get(url) as response:
                    page_data = await response.json()

                tasks = asyncio.gather(
                    *[
                        download_item(
                            download_dir=download_dir,
                            item_data=obj,
                            results=results,
                            bar=bar,
                            session=session,
                            original=original,
                            allowed_extensions=allowed_extensions,
                            semaphore=sem,
                        )
                        for obj in page_data["response"]["docs"]
                    ]
                )
                await tasks
                page += 1


async def download_item(
    download_dir,
    item_data,
    results,
    bar,
    session,
    original,
    allowed_extensions,
    semaphore,
):
    async with semaphore:
        # list the item's files, keeping only the ones matching the requested filters
        files_data = await get_files_data(item_data["identifier"], session)
        to_download = list(
            filter_files(
                files_data["result"],
                original=original,
                allowed_extensions=allowed_extensions,
            )
        )
        item_dir = get_item_dir(download_dir, item_data["identifier"])
        files_semaphore = asyncio.Semaphore(2)
        for f in to_download:
            url = f"https://archive.org/download/{item_data['identifier']}/{f['name']}"
            path = os.path.join(item_dir, f["name"])
            if os.path.exists(path) and await check_integrity(path, f["sha1"]):
                logger.debug(f"Skipping already downloaded file at {path}")
                continue
            await download_file(path, url, session, semaphore=files_semaphore)
            # jitter to avoid connection reset by peer from archive.org
            await asyncio.sleep(random.uniform(0.3, 1))

        results["ok"] += 1
        bar.update(1)


def get_item_dir(download_dir, identifier):
    # to avoid having thousands of directories in the same place, we chunk the
    # identifier and append it to the download dir, like this:
    # /tmp/download/id/en/ti/fi/identifier
    slug = identifier.replace("-", "").replace("_", "").lower()
    id_chunks = [slug[i : i + 2] for i in range(0, 8, 2)] + [identifier]
    path = os.path.join(download_dir, *id_chunks)

    if not os.path.exists(path):
        logger.debug(f"Creating item dir {path}...")
        os.makedirs(path)
    return path


async def check_integrity(path, expected_checksum):
    # note: reads the whole file into memory to hash it
    async with aiofiles.open(path, mode="rb") as f:
        sha1 = hashlib.sha1()
        sha1.update(await f.read())

    return expected_checksum == sha1.hexdigest()


async def get_files_data(identifier, session):
    url = f"https://archive.org/metadata/{identifier}/files"
    logger.debug(f"Fetching files data at {url}...")
    async with session.get(url) as response:
        return await response.json()


async def download_file(path, url, session, semaphore):
    async with semaphore:
        logger.debug(f"Downloading file {url}...")
        async with aiofiles.open(path, mode="wb") as f:
            async with session.get(url) as response:
                # note: buffers the entire response in memory before writing
                await f.write(await response.read())


def filter_files(files, original, allowed_extensions):
    for f in files:
        if original and f["source"] != "original":
            logger.debug(f"Skipping non-original file {f['name']}")
            continue

        if allowed_extensions:
            extension = os.path.splitext(f["name"])[-1][1:]
            if extension not in allowed_extensions:
                logger.debug(
                    f"Skipping extension {extension} ({allowed_extensions} requested)"
                )
                continue
        yield f


def get_search_url(query, page_size, page):
    q = urllib.parse.urlencode({"q": query})
    return f"https://archive.org/advancedsearch.php?{q}&sort[]=addeddate+desc&rows={page_size}&page={page}&output=json"


if __name__ == "__main__":
    cli()
@@ -0,0 +1,47 @@
[metadata]
name = archiveorg-dl
description = An async downloader to grab content from Archive.org
version = 0.1.dev0
author = Eliot Berriot
author_email = contact@eliotberriot.com
url = https://dev.funkwhale.audio/funkwhale/archiveorg-dowloader
long_description = file: README.md
license = AGPL3
keywords = downloader, network, archive
classifiers =
    Development Status :: 3 - Alpha
    License :: OSI Approved :: GNU Affero General Public License v3
    Natural Language :: English
    Programming Language :: Python :: 3.6

[options]
zip_safe = True
include_package_data = True
packages = find:
install_requires =
    click
    click-log
    aiofiles
    aiohttp

[options.entry_points]
console_scripts =
    archiveorg-dl = archiveorg_dl:cli

[options.extras_require]
dev = ipdb

[options.packages.find]
exclude =
    tests

[bdist_wheel]
universal = 1

[tool:pytest]
testpaths = tests
@@ -0,0 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from setuptools import setup

setup()