Added osu file scraper
This commit is contained in:
parent
80efb8b222
commit
8309c9a561
13
config.json
Normal file
13
config.json
Normal file
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"sql": {
|
||||
"host": "127.0.0.1",
|
||||
"user": "root",
|
||||
"passwd": "toor",
|
||||
"db": "osu-wayback"
|
||||
},
|
||||
"scrape": {
|
||||
"delay": 1,
|
||||
"max_attempts": 3,
|
||||
"increase_delay_on_fail": true
|
||||
}
|
||||
}
|
1
memory.json
Normal file
1
memory.json
Normal file
|
@ -0,0 +1 @@
|
|||
{"scrape": {"last": 0, "failed": []}}
|
67
scrape.py
Normal file
67
scrape.py
Normal file
|
@ -0,0 +1,67 @@
|
|||
#!/usr/bin/env python3
"""Scrape osu! update metadata from osu.ppy.sh and store it in a local MySQL database.

Walks build numbers sequentially via check-updates.php, inserting each
build's file metadata into the `updates` table. Connection and scraping
settings come from config.json; progress (last scraped build and the list
of failed targets) is persisted to memory.json via an atexit handler so a
restart resumes where the previous run stopped.
"""

import atexit
import json
import sys
import time
import urllib.request

import MySQLdb
import MySQLdb.cursors

CHECK_UPDATES_URL = (
    "https://osu.ppy.sh/web/check-updates.php"
    "?action=path&stream=stable&target={}"
)

with open("config.json", "r") as f:
    config = json.load(f)

with open("memory.json", "r") as f:
    memory = json.load(f)

sql = MySQLdb.connect(**config["sql"], cursorclass=MySQLdb.cursors.DictCursor)
cur = sql.cursor()


def on_close():
    """Persist scraping progress so a restart resumes where we left off."""
    with open("memory.json", "w") as f:
        json.dump(memory, f)
    print("Closing...")


atexit.register(on_close)

# Abort after this many consecutive failed attempts. Read with a default so
# existing config.json files (which lack the key) keep working unchanged.
max_failed_streak = config["scrape"].get("max_failed_streak", 10)

failed_streak = 0

while True:
    target = memory["scrape"]["last"] + 1
    attempts = 0
    completed = False
    extra_sleep = 0
    while attempts < config["scrape"]["max_attempts"] and not completed:
        try:
            with urllib.request.urlopen(CHECK_UPDATES_URL.format(target)) as url:
                data = json.loads(url.read().decode())[0]
            # Not every build ships a binary patch; normalize to NULL.
            data.setdefault("url_patch", None)
            cur.execute(
                "INSERT INTO updates (file_version,filename,file_hash,filesize,"
                "timestamp,patch_id,url_full,url_patch) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                [
                    data["file_version"],
                    data["filename"],
                    data["file_hash"],
                    data["filesize"],
                    data["timestamp"],
                    data["patch_id"],
                    data["url_full"],
                    data["url_patch"],
                ],
            )
            # MySQLdb disables autocommit by default; without this the
            # inserts are silently discarded on transactional engines.
            sql.commit()
            completed = True
            failed_streak = 0
            print("target: {}, status: OK".format(target))
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # stop the script (the atexit handler persists memory.json).
            # Record the failed build so it can be retried later.
            if target not in memory["scrape"]["failed"]:
                memory["scrape"]["failed"].append(target)
            attempts += 1
            failed_streak += 1
            if config["scrape"]["increase_delay_on_fail"]:
                # Back off linearly with each consecutive failed attempt.
                extra_sleep = attempts
            print("target: {}, status: FAILED, attempt: {}".format(target, attempts))
        time.sleep(config["scrape"]["delay"] + extra_sleep)
        if failed_streak > max_failed_streak:
            # Sustained failures usually mean we ran past the newest build
            # (or got rate-limited). `target` is deliberately NOT written to
            # memory["scrape"]["last"], so it is retried on the next run.
            sys.exit()
    memory["scrape"]["last"] = target
|
13
updates.sql
Normal file
13
updates.sql
Normal file
|
@ -0,0 +1,13 @@
|
|||
-- Schema for scraped osu! client update metadata: one row per file returned
-- by check-updates.php for a given build (file_version is the scrape target).
CREATE TABLE `updates` (
  `file_version` int(11) NOT NULL,        -- build number; scraper walks these sequentially
  `filename` varchar(32) NOT NULL,
  `file_hash` varchar(32) NOT NULL,       -- 32 chars; presumably an MD5 hex digest — TODO confirm
  `filesize` int(11) NOT NULL,
  `timestamp` datetime NOT NULL,
  `patch_id` int(11) DEFAULT NULL,        -- NULL when the build ships no binary patch
  `url_full` varchar(128) NOT NULL,       -- download URL for the full file
  `url_patch` varchar(128) DEFAULT NULL   -- download URL for the patch; NULL when absent
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

-- file_version uniquely identifies a scraped build file.
ALTER TABLE `updates`
  ADD PRIMARY KEY (`file_version`);
|
Loading…
Reference in New Issue
Block a user