Initial commit

Branch: master
Author: Eliot Berriot, 2 years ago
Commit: 93175f2c9c
4 changed files with 286 additions and 0 deletions
1. README.md (+28, -0)
2. archiveorg_dl/__init__.py (+206, -0)
3. setup.cfg (+47, -0)
4. setup.py (+5, -0)

README.md (+28, -0)

@@ -0,0 +1,28 @@
An archive.org downloader using Python and async I/O for maximum performance.
Installation
------------
archiveorg-dl requires Python >= 3.6 to work:
```bash
pip install git+https://dev.funkwhale.audio/funkwhale/archiveorg-dl.git
```
Usage
-----
```bash
# print help
archiveorg-dl --help
# download all original mp3 and ogg files from the freemusicarchive collection to /destination
archiveorg-dl download "collection:freemusicarchive AND mediatype:audio" /destination --original -e mp3 -e ogg
```
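The `download` command also takes throughput options. The invocation below is a sketch: `--page-size`/`-s` and `--parallel`/`-p` are taken from the click options in `archiveorg_dl/__init__.py`, and `-v` is the verbosity flag added by click-log; check `archiveorg-dl download --help` for the authoritative list:
```bash
# fetch search results 50 per page, allow at most 4 items to download in
# parallel, and enable debug logging
archiveorg-dl download "collection:freemusicarchive AND mediatype:audio" /destination \
    --page-size 50 --parallel 4 -v DEBUG
```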
Download queries
----------------
The `download` command's query argument is passed directly to archive.org. Refer
to [the corresponding documentation to craft your own queries](https://archive.org/advancedsearch.php).
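For instance, archive.org's advanced search accepts Lucene-style filters, so narrower queries along these lines should work. Both examples are illustrative: "Some Artist" is a placeholder, and the field names (`date`, `creator`) should be verified against the documentation linked above:
```bash
# hypothetical narrower queries: restrict by year, or by creator
archiveorg-dl download "collection:freemusicarchive AND date:[2015-01-01 TO 2015-12-31]" /destination
archiveorg-dl download "creator:(Some Artist) AND mediatype:audio" /destination -e mp3
```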

archiveorg_dl/__init__.py (+206, -0)

@@ -0,0 +1,206 @@
import asyncio
import hashlib
import logging
import math
import os
import random
import urllib.parse

import aiofiles
import aiohttp
import click
import click_log

logger = logging.getLogger(__name__)
click_log.basic_config(logger)


@click.group()
def cli():
    pass


@cli.command()
@click.argument("query")
@click.argument(
    "download_dir",
    type=click.Path(
        dir_okay=True, exists=True, file_okay=False, writable=True, readable=True
    ),
)
@click.option(
    "--original/--no-original", default=True, help="Only download original files"
)
@click.option("--page-size", "-s", default=100)
@click.option(
    "--parallel", "-p", default=10, help="Number of parallel downloads allowed"
)
@click.option("--extension", "-e", multiple=True, help="Restrict extensions to download")
@click_log.simple_verbosity_option(logger)
def download(query, download_dir, original, page_size, parallel, extension):
    """
    Download all files matching the given query and formats from Archive.org.

    Example query: "collection:freemusicarchive AND mediatype:audio"
    """
    logger.info(f"Starting download for query '{query}'")
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        handle_download(
            query=query,
            download_dir=download_dir,
            parallel=parallel,
            original=original,
            page_size=page_size,
            allowed_extensions=extension,
        )
    )

async def handle_download(
    query, download_dir, original, page_size, parallel, allowed_extensions
):
    sem = asyncio.Semaphore(parallel)
    conn = aiohttp.TCPConnector()
    async with aiohttp.ClientSession(connector=conn, raise_for_status=True) as session:
        # issue a single-result query first, just to learn how many items match
        url = get_search_url(query, page_size=1, page=1)
        async with session.get(url) as response:
            data = await response.json()
        total = data["response"]["numFound"]
        logger.info(f"Downloading {total} items in {download_dir}...")
        page = 1
        total_pages = math.ceil(total / page_size)
        results = {
            "total": total,
            "handled": 0,
            "ok": 0,
            "skip": 0,
            "error": 0,
            "size": 0,
        }
        with click.progressbar(
            length=total, label="Downloading items", show_pos=True, show_percent=True
        ) as bar:
            while page <= total_pages:
                url = get_search_url(query, page_size=page_size, page=page)
                logger.debug(f"Fetching {url}...")
                async with session.get(url) as response:
                    page_data = await response.json()
                # download every item of the current page concurrently; the
                # semaphore caps how many run at once
                tasks = asyncio.gather(
                    *[
                        download_item(
                            download_dir=download_dir,
                            item_data=obj,
                            results=results,
                            bar=bar,
                            session=session,
                            original=original,
                            allowed_extensions=allowed_extensions,
                            semaphore=sem,
                        )
                        for obj in page_data["response"]["docs"]
                    ]
                )
                await tasks
                page += 1

async def download_item(
    download_dir,
    item_data,
    results,
    bar,
    session,
    original,
    allowed_extensions,
    semaphore,
):
    async with semaphore:
        files_data = await get_files_data(item_data["identifier"], session)
        to_download = list(
            filter_files(
                files_data["result"],
                original=original,
                allowed_extensions=allowed_extensions,
            )
        )
        item_dir = get_item_dir(download_dir, item_data["identifier"])
        files_semaphore = asyncio.Semaphore(2)
        for f in to_download:
            url = f"https://archive.org/download/{item_data['identifier']}/{f['name']}"
            path = os.path.join(item_dir, f["name"])
            if os.path.exists(path) and await check_integrity(path, f["sha1"]):
                logger.debug(f"Skipping already downloaded file at {path}")
                continue
            await download_file(path, url, session, semaphore=files_semaphore)
            # jitter to avoid connection reset by peer from archive.org
            await asyncio.sleep(random.uniform(0.3, 1))
        results["ok"] += 1
        bar.update(1)


def get_item_dir(download_dir, identifier):
    # to avoid having thousands of directories in the same place, we chunk the
    # identifier and append it to the download dir, like this:
    # /tmp/download/id/en/ti/fi/identifier
    slug = identifier.replace("-", "").replace("_", "").lower()
    id_chunks = [slug[i : i + 2] for i in range(0, 8, 2)] + [identifier]
    path = os.path.join(download_dir, *id_chunks)
    if not os.path.exists(path):
        logger.debug(f"Creating item dir {path}...")
        os.makedirs(path)
    return path


async def check_integrity(path, expected_checksum):
    # archive.org publishes a sha1 checksum per file; hash the local copy and compare
    async with aiofiles.open(path, mode="rb") as f:
        checksum = hashlib.sha1()
        checksum.update(await f.read())
    return expected_checksum == checksum.hexdigest()

async def get_files_data(identifier, session):
    url = f"https://archive.org/metadata/{identifier}/files"
    logger.debug(f"Fetching files data at {url}...")
    async with session.get(url) as response:
        return await response.json()


async def download_file(path, url, session, semaphore):
    async with semaphore:
        logger.debug(f"Downloading file {url}...")
        async with aiofiles.open(path, mode="wb") as f:
            async with session.get(url) as response:
                # buffer the whole payload in memory before writing it to disk
                await f.write(await response.read())


def filter_files(files, original, allowed_extensions):
    for f in files:
        if original and f["source"] != "original":
            logger.debug(f"Skipping non-original file {f['name']}")
            continue
        if allowed_extensions:
            extension = os.path.splitext(f["name"])[-1][1:]
            if extension not in allowed_extensions:
                logger.debug(
                    f"Skipping extension {extension} ({allowed_extensions} requested)"
                )
                continue
        yield f


def get_search_url(query, page_size, page):
    q = urllib.parse.urlencode({"q": query})
    return (
        "https://archive.org/advancedsearch.php"
        f"?{q}&sort[]=addeddate+desc&rows={page_size}&page={page}&output=json"
    )


if __name__ == "__main__":
    cli()

setup.cfg (+47, -0)

@@ -0,0 +1,47 @@
[metadata]
name = archiveorg-dl
description = An async downloader to grab content from Archive.org
version = 0.1.dev0
author = Eliot Berriot
author_email = contact@eliotberriot.com
url = https://dev.funkwhale.audio/funkwhale/archiveorg-dowloader
long_description = file: README.md
license = AGPL3
keywords = downloader, network, archive
classifiers =
    Development Status :: 3 - Alpha
    License :: OSI Approved :: GNU Affero General Public License v3
    Natural Language :: English
    Programming Language :: Python :: 3.6
[options]
zip_safe = True
include_package_data = True
packages = find:
python_requires = >=3.6
install_requires =
    click
    click-log
    aiofiles
    aiohttp
[options.entry_points]
console_scripts =
    archiveorg-dl = archiveorg_dl:cli
[options.extras_require]
dev = ipdb

[options.packages.find]
exclude =
    tests
[tool:pytest]
testpaths = tests

setup.py (+5, -0)

@@ -0,0 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from setuptools import setup

setup()
