From d75985651fe859089c503ff6f8d1e6ad540f99a2 Mon Sep 17 00:00:00 2001
From: Sunpy
Date: Mon, 11 Mar 2019 16:14:03 +0100
Subject: [PATCH] Scraper

---
 autoSchedule/__init__.py | 19 ++++++++++++
 autoSchedule/scrape.py   | 64 ++++++++++++++++++++++++++++++++++++++++
 config.json              | 17 ++++-------
 main.py                  |  5 +++-
 4 files changed, 92 insertions(+), 13 deletions(-)
 create mode 100644 autoSchedule/__init__.py
 create mode 100644 autoSchedule/scrape.py

diff --git a/autoSchedule/__init__.py b/autoSchedule/__init__.py
new file mode 100644
index 0000000..688e11c
--- /dev/null
+++ b/autoSchedule/__init__.py
@@ -0,0 +1,19 @@
+from time import time, sleep
+from threading import Thread
+
+from . import scrape
+
+from objects import glob
+
+def schedule_loop():
+    if glob.config["scrape"]["run_on_startup"]:
+        scrape.run()
+
+    while True:
+        _time = time() + glob.config["scrape"]["schedule_seconds"]
+        while _time - time() > 0:
+            sleep(1)
+        scrape.run()
+
+thread = Thread(target = schedule_loop)
+thread.start()
diff --git a/autoSchedule/scrape.py b/autoSchedule/scrape.py
new file mode 100644
index 0000000..c9a67d9
--- /dev/null
+++ b/autoSchedule/scrape.py
@@ -0,0 +1,64 @@
+import urllib.request, json
+import time, calendar
+
+from objects import glob
+
+finished = True
+
+def run():
+    global finished
+
+    if not finished:
+        print("[!] Scraper job is already running! ")
+        return
+
+    print("[!] Starting scraping job...")
+    sql = glob.new_sql()
+    cur = sql.cursor()
+
+    cur.execute("SELECT max(file_version) f FROM updates")
+    target = cur.fetchone()["f"]
+
+    finished = False
+    failed_streak = 0
+    while not finished:
+        target += 1
+
+        attempts = 0
+        extra_sleep = 0
+        while attempts < glob.config["scrape"]["max_attempts"]:
+            try:
+                with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target=%s" % target) as url:
+                    data = json.loads(url.read().decode())[0]
+                    if "url_patch" not in data.keys():
+                        data["url_patch"] = None
+                    cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES ('%s','%s','%s','%s',%s,'%s','%s','%s')" %
+                    (
+                        data["file_version"],
+                        data["filename"],
+                        data["file_hash"],
+                        data["filesize"],
+                        calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
+                        data["patch_id"],
+                        data["url_full"],
+                        data["url_patch"],
+                    ))
+                    sql.commit()
+                    failed_streak = 0
+                    print("[Scraper] Target: %s, Status: OK" % target)
+                    break
+            except:
+                attempts += 1
+                failed_streak += 1
+                if glob.config["scrape"]["increase_delay_on_fail"]:
+                    extra_sleep = attempts
+                print("[Scraper] Target: %s, status: FAILED, Attempt: %s" % (target, attempts))
+
+            time.sleep(glob.config["scrape"]["delay"] + extra_sleep)
+        if failed_streak > glob.config["scrape"]["skips_until_finished"]:
+            finished = True
+            break
+
+    cur.close()
+    sql.close()
+    print("[Scraper] Finished!")
diff --git a/config.json b/config.json
index 4919365..ddaca14 100644
--- a/config.json
+++ b/config.json
@@ -3,23 +3,16 @@
         "host": "0.0.0.0",
         "port": 3003
     },
-    "sql": {
-        "host": "127.0.0.1",
-        "user": "root",
-        "passwd": "toor",
-        "db": "osu-wayback"
-    },
     "scrape": {
         "delay": 1,
-        "max_attempts": 3,
-        "increase_delay_on_fail": true
+        "max_attempts": 3,
+        "increase_delay_on_fail": true,
+        "skips_until_finished": 17,
+        "schedule_seconds": 68400,
+        "run_on_startup": true
     },
     "downloader": {
         "download_folder": "/home/wayback/files"
     },
-    "zipper": {
-        "temp_folder": "/home/wayback/tmp",
-        "output_folder": "/home/wayback/archive"
-    },
     "threads": 4
 }
\ No newline at end of file
diff --git a/main.py b/main.py
index a1607dd..635f437 100644
--- a/main.py
+++ b/main.py
@@ -76,4 +76,7 @@ if __name__ == "__main__":
     print("To stop server press CTRL + C")
     glob.app.listen(glob.config["server"]["port"], address=glob.config["server"]["host"])
     tornado.log.enable_pretty_logging()
-    tornado.ioloop.IOLoop.instance().start()
\ No newline at end of file
+
+    import autoSchedule
+
+    tornado.ioloop.IOLoop.instance().start()
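
Review note (not part of the patch): scrape.py builds the INSERT statement with Python %-string interpolation, so any quote character in the update metadata would break the query or corrupt it. Below is a minimal sketch of the same insert using driver-side parameter binding; it assumes glob.new_sql() returns a DB-API 2.0 connection (e.g. PyMySQL), and insert_update is a hypothetical helper name, not something in the patch.

    import time, calendar

    def insert_update(cur, data):
        # Placeholders are bound by the database driver, which handles
        # quoting/escaping; the timestamp is converted to a Unix epoch
        # the same way the patch does.
        cur.execute(
            "INSERT INTO updates "
            "(file_version, filename, file_hash, filesize, timestamp, patch_id, url_full, url_patch) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
            (
                data["file_version"],
                data["filename"],
                data["file_hash"],
                data["filesize"],
                calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
                data["patch_id"],
                data["url_full"],
                data.get("url_patch"),  # None when the update has no patch URL
            ),
        )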