Added downloader script

This commit is contained in:
Emily 2018-05-23 23:44:30 +02:00
parent 1617b60ea6
commit b0f3b10c34
3 changed files with 61 additions and 2 deletions

View File

@ -16,5 +16,8 @@
"delay": 1,
"max_attempts": 3,
"increase_delay_on_fail": true
}
},
"downloader": {
"download_folder": "/home/wayback/files"
}
}

47
downloader.py Normal file
View File

@ -0,0 +1,47 @@
#Used to download all the files into local folders after scrape data has been saved
import urllib.request, json
import MySQLdb
import MySQLdb.cursors
import os
import atexit
# Download every file recorded in the `updates` table into local folders,
# resuming from the position saved in memory.json.
with open("config.json", "r") as f:
    config = json.load(f)
with open("memory.json", "r") as f:
    memory = json.load(f)
# DictCursor so result rows can be accessed by column name below.
sql = MySQLdb.connect(**config["sql"], cursorclass = MySQLdb.cursors.DictCursor)
cur = sql.cursor()
def on_close():
    # Persist progress (resume index + failed ids) so the next run can
    # skip what was already handled.
    with open("memory.json", "w") as f:
        json.dump(memory, f)
    print("Closing...")
atexit.register(on_close)
cur.execute("SELECT file_version,filename,file_hash,url_full FROM updates")
data = cur.fetchall()
# Remove already downloaded files (checked from memory.json)
data = data[memory["downloader"]["last"]:]
for row in data:
    try:
        print("Downloading {} with id {}".format(row["filename"], row["file_version"]))
        target_dir = os.path.join(
            config["downloader"]["download_folder"],
            row["filename"]
        )
        # BUG FIX: urlretrieve cannot create intermediate directories, so the
        # per-file folder must exist before the download starts; without this,
        # every file whose folder is new is recorded as "failed".
        os.makedirs(target_dir, exist_ok=True)
        urllib.request.urlretrieve(
            row["url_full"],
            os.path.join(target_dir, "f_" + row["file_hash"])
        )
        print("Done.")
    except Exception as e:
        # Best-effort: record the failed id for a later retry and keep going.
        memory["downloader"]["failed"].append(row["file_version"])
        print("Error downloading file {}: {}".format(row["file_version"], e))
    # Progress advances even on failure — failed ids live in the "failed"
    # list rather than being re-attempted on the next run.
    memory["downloader"]["last"] += 1

View File

@ -1 +1,10 @@
{"scrape": {"last": 0, "failed": []}}
{
"scrape": {
"last": 0,
"failed": []
},
"downloader": {
"last": 0,
"failed": []
}
}