diff --git a/config.json b/config.json
index 0a4151f..7e8f8e4 100644
--- a/config.json
+++ b/config.json
@@ -16,5 +16,8 @@
         "delay": 1,
         "max_attempts": 3,
         "increase_delay_on_fail": true
-    }
+    },
+    "downloader": {
+        "download_folder": "/home/wayback/files"
+    }
 }
\ No newline at end of file
diff --git a/downloader.py b/downloader.py
new file mode 100644
index 0000000..248543d
--- /dev/null
+++ b/downloader.py
@@ -0,0 +1,47 @@
+#Used to download all the files into local folders after scrape data has been saved
+
+import urllib.request, json
+import MySQLdb
+import MySQLdb.cursors
+import os
+import atexit
+
+with open("config.json", "r") as f:
+    config = json.load(f)
+
+with open("memory.json", "r") as f:
+    memory = json.load(f)
+
+sql = MySQLdb.connect(**config["sql"], cursorclass = MySQLdb.cursors.DictCursor)
+
+cur = sql.cursor()
+
+def on_close():
+    with open("memory.json", "w") as f:
+        json.dump(memory, f)
+    print("Closing...")
+
+atexit.register(on_close)
+
+cur.execute("SELECT file_version,filename,file_hash,url_full FROM updates")
+data = cur.fetchall()
+
+# Remove already downloaded files (checked from memory.json)
+data = data[memory["downloader"]["last"]:]
+
+for row in data:
+    try:
+        print("Downloading {} with id {}".format(row["filename"], row["file_version"]))
+        urllib.request.urlretrieve(
+            row["url_full"],
+            os.path.join(
+                config["downloader"]["download_folder"],
+                row["filename"],
+                "f_" + row["file_hash"]
+            )
+        )
+        print("Done.")
+    except Exception as e:
+        memory["downloader"]["failed"].append(row["file_version"])
+        print("Error downloading file {}: {}".format(row["file_version"], e))
+    memory["downloader"]["last"] += 1
\ No newline at end of file
diff --git a/memory.json b/memory.json
index 0b69e63..7500ded 100644
--- a/memory.json
+++ b/memory.json
@@ -1 +1,10 @@
-{"scrape": {"last": 0, "failed": []}}
\ No newline at end of file
+{
+    "scrape": {
+        "last": 0,
+        "failed": []
+    },
+    "downloader": {
+        "last": 0,
+        "failed": []
+    }
+}
\ No newline at end of file
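A note on the download path in downloader.py: `urllib.request.urlretrieve` does not create missing directories, and the destination here nests each file under a per-filename folder (`<download_folder>/<filename>/f_<file_hash>`), so any row whose folder does not exist yet will land in the `failed` list with a `FileNotFoundError`. A minimal sketch of a fix, where `download_update` is a hypothetical helper name rather than anything in this patch:

```python
import os
import urllib.request

def download_update(download_folder, filename, file_hash, url):
    """Fetch one file into <download_folder>/<filename>/f_<file_hash>,
    creating the per-filename directory first if it is missing."""
    dest = os.path.join(download_folder, filename, "f_" + file_hash)
    os.makedirs(os.path.dirname(dest), exist_ok=True)  # urlretrieve won't create this
    urllib.request.urlretrieve(url, dest)
    return dest
```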
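Separately, the resume logic slices `fetchall()` by the stored offset, but the `SELECT` has no `ORDER BY`, and MySQL makes no ordering guarantee for a bare `SELECT`, so the offset saved in memory.json can point at a different row on the next run. Assuming `file_version` is monotonic (it reads like a version or id column, though the schema is not part of this patch), an explicit ordering keeps the offset stable:

```python
# Sketch against the patch's own `cur` and `memory` objects, assuming
# file_version is monotonic: a deterministic order makes the saved
# offset meaningful across runs.
cur.execute(
    "SELECT file_version, filename, file_hash, url_full "
    "FROM updates ORDER BY file_version"
)
data = cur.fetchall()[memory["downloader"]["last"]:]
```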
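Finally, progress is only written back by the `atexit` hook, which does not run on a hard kill or power loss, so a long run can lose its entire offset. Flushing memory.json after each row is cheap insurance; `save_memory` below is a hypothetical helper, not part of the patch:

```python
import json

def save_memory(memory, path="memory.json"):
    """Persist the resume state immediately instead of only at exit."""
    with open(path, "w") as f:
        json.dump(memory, f)

# In the download loop, right after `memory["downloader"]["last"] += 1`:
#     save_memory(memory)
```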