Compare commits
No commits in common. "05ecccf0da08e843a11b325738be0eaab27bf752" and "1948e229ecc38b2df7a638bcaf33d3b7e71bc688" have entirely different histories.
05ecccf0da
...
1948e229ec
|
@ -1,19 +0,0 @@
|
||||||
from time import time, sleep
|
|
||||||
from threading import Thread
|
|
||||||
|
|
||||||
from . import scrape
|
|
||||||
|
|
||||||
from objects import glob
|
|
||||||
|
|
||||||
def schedule_loop():
    """Run scrape jobs forever: optionally once at startup, then on a fixed interval.

    Interval and startup behaviour come from glob.config["scrape"]
    ("schedule_seconds", "run_on_startup").  Never returns.
    """
    if glob.config["scrape"]["run_on_startup"]:
        scrape.run()

    while True:
        # Sleep in 1-second slices until the next deadline, then scrape again.
        deadline = time() + glob.config["scrape"]["schedule_seconds"]
        while deadline > time():
            sleep(1)
        scrape.run()
|
|
||||||
|
|
||||||
# Daemon thread: schedule_loop() never returns, and a non-daemon thread would
# keep the interpreter alive forever after the main (tornado) loop stops.
thread = Thread(target=schedule_loop, daemon=True)
thread.start()
|
|
|
@ -1,64 +0,0 @@
|
||||||
import urllib.request, json
|
|
||||||
import time, calendar
|
|
||||||
|
|
||||||
from objects import glob
|
|
||||||
|
|
||||||
# Guard flag: True while no scrape job is active; run() returns early if a
# previous job has not yet finished.
finished = True
|
|
||||||
|
|
||||||
def run():
    """Scrape osu! update metadata from check-updates.php into the `updates` table.

    Resumes from the highest file_version already stored, walking targets
    upward until `skips_until_finished` consecutive targets fail.  Refuses
    to start while a previous job is still running (module-level `finished`
    guard).  Commits after each successful insert.
    """
    global finished

    if not finished:
        print("[!] Scraper job is already running! ")
        return

    print("[!] Starting scraping job...")
    sql = glob.new_sql()
    cur = sql.cursor()

    # Resume from the highest file_version already in the database.
    cur.execute("SELECT max(file_version) f FROM updates")
    target = cur.fetchone()["f"]

    finished = False
    failed_streak = 0
    while not finished:
        target += 1

        attempts = 0
        extra_sleep = 0
        while attempts < glob.config["scrape"]["max_attempts"]:
            try:
                with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target=%s" % target) as url:
                    data = json.loads(url.read().decode())[0]
                    if "url_patch" not in data.keys():
                        data["url_patch"] = None
                    # Parameterized query instead of %-interpolated, hand-quoted
                    # values: filenames/hashes containing quote characters would
                    # break (or inject into) the interpolated statement.
                    cur.execute(
                        "INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                        (
                            data["file_version"],
                            data["filename"],
                            data["file_hash"],
                            data["filesize"],
                            calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
                            data["patch_id"],
                            data["url_full"],
                            data["url_patch"],
                        ))
                    sql.commit()
                    failed_streak = 0
                    print("[Scraper] Target: %s, Status: OK" % target)
                    break
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # still propagate; any fetch/parse/DB error counts as a failure.
                attempts += 1
                failed_streak += 1
                if glob.config["scrape"]["increase_delay_on_fail"]:
                    extra_sleep = attempts
                print("[Scraper] Target: %s, status: FAILED, Attempt: %s" % (target, attempts))
            time.sleep(glob.config["scrape"]["delay"] + extra_sleep)
            if failed_streak > glob.config["scrape"]["skips_until_finished"]:
                finished = True
                break

    cur.close()
    sql.close()
    print("[Scraper] Finished!")
|
|
17
config.json
17
config.json
|
@ -3,16 +3,23 @@
|
||||||
"host": "0.0.0.0",
|
"host": "0.0.0.0",
|
||||||
"port": 3003
|
"port": 3003
|
||||||
},
|
},
|
||||||
|
"sql": {
|
||||||
|
"host": "127.0.0.1",
|
||||||
|
"user": "root",
|
||||||
|
"passwd": "toor",
|
||||||
|
"db": "osu-wayback"
|
||||||
|
},
|
||||||
"scrape": {
|
"scrape": {
|
||||||
"delay": 1,
|
"delay": 1,
|
||||||
"max_attempts": 3,
|
"max_attempts": 3,
|
||||||
"increase_delay_on_fail": true,
|
"increase_delay_on_fail": true
|
||||||
"skips_until_finished": 17,
|
|
||||||
"schedule_seconds": 68400,
|
|
||||||
"run_on_startup": true
|
|
||||||
},
|
},
|
||||||
"downloader": {
|
"downloader": {
|
||||||
"download_folder": "/home/wayback/files"
|
"download_folder": "/home/wayback/files"
|
||||||
},
|
},
|
||||||
|
"zipper": {
|
||||||
|
"temp_folder": "/home/wayback/tmp",
|
||||||
|
"output_folder": "/home/wayback/archive"
|
||||||
|
},
|
||||||
"threads": 4
|
"threads": 4
|
||||||
}
|
}
|
3
main.py
3
main.py
|
@ -76,7 +76,4 @@ if __name__ == "__main__":
|
||||||
print("To stop server press CTRL + C")
|
print("To stop server press CTRL + C")
|
||||||
glob.app.listen(glob.config["server"]["port"], address=glob.config["server"]["host"])
|
glob.app.listen(glob.config["server"]["port"], address=glob.config["server"]["host"])
|
||||||
tornado.log.enable_pretty_logging()
|
tornado.log.enable_pretty_logging()
|
||||||
|
|
||||||
import autoSchedule
|
|
||||||
|
|
||||||
tornado.ioloop.IOLoop.instance().start()
|
tornado.ioloop.IOLoop.instance().start()
|
67
scrape.py
Normal file
67
scrape.py
Normal file
|
@ -0,0 +1,67 @@
|
||||||
|
#Used for scraping osu! update files from osu.ppy.sh and storing them in its own database
|
||||||
|
|
||||||
|
import urllib.request, json
|
||||||
|
import MySQLdb
|
||||||
|
import MySQLdb.cursors
|
||||||
|
import time, calendar
|
||||||
|
import atexit
|
||||||
|
|
||||||
|
# Loop-control flag for the main scrape loop below; never set True in the
# current code, so the script runs until the failure-streak bail-out.
finished = False

# Load shared settings and the persisted scrape progress.
with open("config.json", "r") as config_file:
    config = json.load(config_file)

with open("memory.json", "r") as memory_file:
    memory = json.load(memory_file)

# DictCursor so fetched rows are addressable by column name.
sql = MySQLdb.connect(**config["sql"], cursorclass=MySQLdb.cursors.DictCursor)
cur = sql.cursor()
|
||||||
|
|
||||||
|
def on_close():
    """Persist the in-memory progress record back to memory.json at exit."""
    with open("memory.json", "w") as out:
        json.dump(memory, out)
    print("Closing...")

# Flush progress whenever the interpreter exits (normal end or SystemExit).
atexit.register(on_close)
|
||||||
|
|
||||||
|
# Consecutive-failure counter; a long streak means we have walked past the
# newest published update, so the script stops.
failed_streak = 0

# Walk update targets upward from the last persisted position.  Progress
# (last target, failed targets) survives restarts via memory.json.
while not finished:
    target = memory["scrape"]["last"] + 1
    attempts = 0
    completed = False
    extra_sleep = 0
    while attempts < config["scrape"]["max_attempts"] and not completed:
        try:
            with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target={}".format(target)) as url:
                data = json.loads(url.read().decode())[0]
                if "url_patch" not in data.keys():
                    data["url_patch"] = None
                cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                    [
                        data["file_version"],
                        data["filename"],
                        data["file_hash"],
                        data["filesize"],
                        calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
                        data["patch_id"],
                        data["url_full"],
                        data["url_patch"]
                    ])
                sql.commit()
                completed = True
                failed_streak = 0
                print("target: {}, status: OK".format(target))
        except Exception:
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and made Ctrl-C count as a failure.
            if target not in memory["scrape"]["failed"]:
                memory["scrape"]["failed"].append(target)
            attempts += 1
            failed_streak += 1
            if config["scrape"]["increase_delay_on_fail"]:
                extra_sleep = attempts
            print("target: {}, status: FAILED, attempt: {}".format(target, attempts))
        time.sleep(config["scrape"]["delay"] + extra_sleep)
        if failed_streak > 100:
            # raise SystemExit instead of exit(): exit() is injected by the
            # site module and may be absent; SystemExit still runs the
            # atexit hook that saves memory.json.
            raise SystemExit
    # Advance even when every attempt failed: the target was recorded in
    # memory["scrape"]["failed"] above, so it can be retried later.
    memory["scrape"]["last"] = target
|
50
zipper.py
Normal file
50
zipper.py
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
#Used to zip all the files into new local folders after downloader is done
|
||||||
|
|
||||||
|
import urllib.request, json
|
||||||
|
import MySQLdb
|
||||||
|
import MySQLdb.cursors
|
||||||
|
import os
|
||||||
|
import atexit
|
||||||
|
|
||||||
|
# Load shared settings and the persisted zipper progress.
with open("config.json", "r") as config_file:
    config = json.load(config_file)

with open("memory.json", "r") as memory_file:
    memory = json.load(memory_file)

# DictCursor so fetched rows are addressable by column name.
sql = MySQLdb.connect(**config["sql"], cursorclass=MySQLdb.cursors.DictCursor)
cur = sql.cursor()
|
||||||
|
|
||||||
|
def on_close():
    """Persist the in-memory progress record back to memory.json at exit."""
    with open("memory.json", "w") as out:
        json.dump(memory, out)
    print("Closing...")

# Flush progress whenever the interpreter exits.
atexit.register(on_close)
|
||||||
|
|
||||||
|
cur.execute("SELECT file_version,filename,file_hash,url_full FROM updates")
all_rows = cur.fetchall()

# Skip entries already processed in a previous run; the resume offset is
# persisted in memory.json under ["zipper"]["last"].
data = all_rows[memory["zipper"]["last"]:]
|
||||||
|
|
||||||
|
# Unfinished - replace with zipper code
# NOTE(review): the disabled draft below is downloader code, not zipper code.
# If revived, check: it updates memory["downloader"] while this script resumes
# from memory["zipper"]["last"], and the os.path.join call treats
# row["filename"] as a directory component — confirm that is intended.
"""
for row in data:
    try:
        print("Downloading {} with id {}".format(row["filename"], row["file_version"]))
        urllib.request.urlretrieve(
            row["url_full"],
            os.path.join(
                config["downloader"]["download_folder"],
                row["filename"],
                "f_" + row["file_hash"]
            )
        )
        print("Done.")
    except Exception as e:
        memory["downloader"]["failed"].append(row["file_version"])
        print("Error downloading file {}: {}".format(row["file_version"], e))
    memory["downloader"]["last"] += 1
"""
|
Loading…
Reference in New Issue
Block a user