Added osu file scraper

This commit is contained in:
Sunpy 2018-01-12 22:28:27 +01:00
parent 80efb8b222
commit 8309c9a561
4 changed files with 94 additions and 0 deletions

13
config.json Normal file
View File

@ -0,0 +1,13 @@
{
"sql": {
"host": "127.0.0.1",
"user": "root",
"passwd": "toor",
"db": "osu-wayback"
},
"scrape": {
"delay": 1,
"max_attempts": 3,
"increase_delay_on_fail": true
}
}

1
memory.json Normal file
View File

@ -0,0 +1 @@
{"scrape": {"last": 0, "failed": []}}

67
scrape.py Normal file
View File

@ -0,0 +1,67 @@
# Scrapes osu! client update metadata from osu.ppy.sh and stores it in a local database.
# Progress is persisted in memory.json so the scraper can resume after a restart.
import urllib.request, json
import sys
import MySQLdb
import MySQLdb.cursors
import time
import atexit

finished = False

# Load DB credentials / scrape tuning, and the persisted scrape position.
with open("config.json", "r") as f:
    config = json.load(f)
with open("memory.json", "r") as f:
    memory = json.load(f)

sql = MySQLdb.connect(**config["sql"], cursorclass = MySQLdb.cursors.DictCursor)
cur = sql.cursor()

def on_close():
    """Persist scrape progress on interpreter exit so a restart resumes correctly."""
    with open("memory.json", "w") as f:
        json.dump(memory, f)
    print("Closing...")
atexit.register(on_close)

failed_streak = 0  # consecutive failed attempts across targets; used as a stop condition
while not finished:
    target = memory["scrape"]["last"] + 1
    attempts = 0
    completed = False
    extra_sleep = 0
    while attempts < config["scrape"]["max_attempts"] and not completed:
        try:
            with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target={}".format(target)) as url:
                data = json.loads(url.read().decode())[0]
            # Full-only releases carry no patch download; normalize to NULL.
            if "url_patch" not in data:
                data["url_patch"] = None
            cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                [
                    data["file_version"],
                    data["filename"],
                    data["file_hash"],
                    data["filesize"],
                    data["timestamp"],
                    data["patch_id"],
                    data["url_full"],
                    data["url_patch"]
                ]
            )
            # No-op for MyISAM, but required if the table is ever moved to InnoDB;
            # without it inserts were never durably committed under autocommit=off.
            sql.commit()
            completed = True
            failed_streak = 0
            print("target: {}, status: OK".format(target))
        except Exception as e:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid the actual error.
            attempts += 1
            failed_streak += 1
            if config["scrape"]["increase_delay_on_fail"]:
                extra_sleep = attempts
            print("target: {}, status: FAILED, attempt: {}, error: {}".format(target, attempts, e))
        # Throttle between requests (success or failure).
        time.sleep(config["scrape"]["delay"] + extra_sleep)
        if failed_streak > 10:
            # Many consecutive failures: assume we've passed the newest build and stop.
            sys.exit()
    # BUG FIX: record a target as failed only once every attempt is exhausted;
    # previously it was recorded on the first failure even when a retry succeeded.
    if not completed and target not in memory["scrape"]["failed"]:
        memory["scrape"]["failed"].append(target)
    memory["scrape"]["last"] = target

13
updates.sql Normal file
View File

@ -0,0 +1,13 @@
-- Table of osu! client update entries scraped from check-updates.php.
-- Primary key declared inline (equivalent to the separate ALTER TABLE form).
CREATE TABLE `updates` (
  `file_version` int(11) NOT NULL,        -- unique build/file identifier
  `filename` varchar(32) NOT NULL,
  `file_hash` varchar(32) NOT NULL,       -- MD5 of the file
  `filesize` int(11) NOT NULL,
  `timestamp` datetime NOT NULL,
  `patch_id` int(11) DEFAULT NULL,        -- NULL when no patch exists for this build
  `url_full` varchar(128) NOT NULL,       -- full-download URL
  `url_patch` varchar(128) DEFAULT NULL,  -- patch-download URL, NULL for full-only releases
  PRIMARY KEY (`file_version`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;