From 8309c9a5619ccee7648af54e08f462ca3a58e728 Mon Sep 17 00:00:00 2001
From: Sunpy
Date: Fri, 12 Jan 2018 22:28:27 +0100
Subject: [PATCH] Added osu file scraper

---
 config.json | 13 ++++++++
 memory.json |  1 +
 scrape.py   | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 updates.sql | 13 ++++++++
 4 files changed, 98 insertions(+)
 create mode 100644 config.json
 create mode 100644 memory.json
 create mode 100644 scrape.py
 create mode 100644 updates.sql

diff --git a/config.json b/config.json
new file mode 100644
index 0000000..bd6b023
--- /dev/null
+++ b/config.json
@@ -0,0 +1,13 @@
+{
+    "sql": {
+        "host": "127.0.0.1",
+        "user": "root",
+        "passwd": "toor",
+        "db": "osu-wayback"
+    },
+    "scrape": {
+        "delay": 1,
+        "max_attempts": 3,
+        "increase_delay_on_fail": true
+    }
+}
\ No newline at end of file
diff --git a/memory.json b/memory.json
new file mode 100644
index 0000000..0b69e63
--- /dev/null
+++ b/memory.json
@@ -0,0 +1 @@
+{"scrape": {"last": 0, "failed": []}}
\ No newline at end of file
diff --git a/scrape.py b/scrape.py
new file mode 100644
index 0000000..de1d9ac
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,71 @@
+# Scrapes osu! update file metadata from osu.ppy.sh and stores it in our own database
+
+import urllib.request, json
+import sys
+import MySQLdb
+import MySQLdb.cursors
+import time
+import atexit
+
+finished = False
+
+with open("config.json", "r") as f:
+    config = json.load(f)
+
+with open("memory.json", "r") as f:
+    memory = json.load(f)
+
+sql = MySQLdb.connect(**config["sql"], cursorclass=MySQLdb.cursors.DictCursor)
+
+cur = sql.cursor()
+
+def on_close():
+    # Persist scrape progress so the next run resumes where this one stopped.
+    with open("memory.json", "w") as f:
+        json.dump(memory, f)
+    print("Closing...")
+
+atexit.register(on_close)
+
+failed_streak = 0
+
+while not finished:
+    target = memory["scrape"]["last"] + 1
+    attempts = 0
+    completed = False
+    extra_sleep = 0
+    while attempts < config["scrape"]["max_attempts"] and not completed:
+        try:
+            with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target={}".format(target)) as url:
+                data = json.loads(url.read().decode())[0]
+                # Not every release ships a delta patch; normalise the optional field.
+                if "url_patch" not in data:
+                    data["url_patch"] = None
+                cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
+                    [
+                        data["file_version"],
+                        data["filename"],
+                        data["file_hash"],
+                        data["filesize"],
+                        data["timestamp"],
+                        data["patch_id"],
+                        data["url_full"],
+                        data["url_patch"]
+                    ]
+                )
+                sql.commit()  # MySQLdb leaves autocommit off, so commit each insert
+                completed = True
+                failed_streak = 0
+                print("target: {}, status: OK".format(target))
+        except Exception:
+            if target not in memory["scrape"]["failed"]:
+                memory["scrape"]["failed"].append(target)
+            attempts += 1
+            failed_streak += 1
+            if config["scrape"]["increase_delay_on_fail"]:
+                extra_sleep = attempts
+            print("target: {}, status: FAILED, attempt: {}".format(target, attempts))
+        time.sleep(config["scrape"]["delay"] + extra_sleep)
+        if failed_streak > 10:
+            sys.exit()  # give up after 10 consecutive failures; atexit still saves memory.json
+    memory["scrape"]["last"] = target
\ No newline at end of file
diff --git a/updates.sql b/updates.sql
new file mode 100644
index 0000000..17d3b00
--- /dev/null
+++ b/updates.sql
@@ -0,0 +1,13 @@
+CREATE TABLE `updates` (
+  `file_version` int(11) NOT NULL,
+  `filename` varchar(32) NOT NULL,
+  `file_hash` varchar(32) NOT NULL,
+  `filesize` int(11) NOT NULL,
+  `timestamp` datetime NOT NULL,
+  `patch_id` int(11) DEFAULT NULL,
+  `url_full` varchar(128) NOT NULL,
+  `url_patch` varchar(128) DEFAULT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1;
+
+ALTER TABLE `updates`
+  ADD PRIMARY KEY (`file_version`);
\ No newline at end of file
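
---
A quick way to sanity-check the endpoint by hand before running the scraper.
This is a minimal sketch, assuming only what scrape.py itself assumes: that
check-updates.php returns a JSON array whose first element carries the columns
inserted above, with url_patch optional. The value target=1 is an arbitrary
example, not a known-good ID:

    import urllib.request, json

    with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target=1") as url:
        data = json.loads(url.read().decode())[0]
    # url_patch is absent for releases without a delta patch, hence .get()
    for field in ("file_version", "filename", "file_hash", "filesize",
                  "timestamp", "patch_id", "url_full", "url_patch"):
        print(field, "->", data.get(field))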