Compare commits


2 Commits

SHA1        Message        Date
05ecccf0da  Del old files  2019-03-11 16:21:31 +01:00
d75985651f  Scraper        2019-03-11 16:14:03 +01:00
6 changed files with 92 additions and 130 deletions

autoSchedule/__init__.py (new file, 19 additions)

@@ -0,0 +1,19 @@
from time import time, sleep
from threading import Thread

from . import scrape
from objects import glob


def schedule_loop():
    if glob.config["scrape"]["run_on_startup"]:
        scrape.run()
    while True:
        _time = time() + glob.config["scrape"]["schedule_seconds"]
        while _time - time() > 0:
            sleep(1)
        scrape.run()


thread = Thread(target = schedule_loop)
thread.start()
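
The module above starts its scheduler at import time as a regular, non-daemon thread, so the thread keeps the process alive even after Tornado's IOLoop exits. Below is a minimal sketch of a daemon-thread variant; start_scheduler is a hypothetical helper and the daemon=True flag is standard threading.Thread behaviour, not something this commit uses.

from threading import Thread

def start_scheduler(loop_fn):
    # Hypothetical helper (not in the commit): a daemon thread is torn down
    # with the main process instead of keeping it alive after shutdown.
    thread = Thread(target = loop_fn, daemon = True)
    thread.start()
    return thread

# Usage would be start_scheduler(schedule_loop) in place of the last two lines above.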

autoSchedule/scrape.py (new file, 64 additions)

@@ -0,0 +1,64 @@
import urllib.request, json
import time, calendar

from objects import glob

finished = True


def run():
    global finished

    if not finished:
        print("[!] Scraper job is already running! ")
        return
    print("[!] Starting scraping job...")

    sql = glob.new_sql()
    cur = sql.cursor()
    cur.execute("SELECT max(file_version) f FROM updates")
    target = cur.fetchone()["f"]

    finished = False
    failed_streak = 0
    while not finished:
        target += 1
        attempts = 0
        extra_sleep = 0
        while attempts < glob.config["scrape"]["max_attempts"]:
            try:
                with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target=%s" % target) as url:
                    data = json.loads(url.read().decode())[0]
                if "url_patch" not in data.keys():
                    data["url_patch"] = None
                cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES ('%s','%s','%s','%s',%s,'%s','%s','%s')" %
                    (
                        data["file_version"],
                        data["filename"],
                        data["file_hash"],
                        data["filesize"],
                        calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
                        data["patch_id"],
                        data["url_full"],
                        data["url_patch"],
                    ))
                sql.commit()
                failed_streak = 0
                print("[Scraper] Target: %s, Status: OK" % target)
                break
            except:
                attempts += 1
                failed_streak += 1
                if glob.config["scrape"]["increase_delay_on_fail"]:
                    extra_sleep = attempts
                print("[Scraper] Target: %s, status: FAILED, Attempt: %s" % (target, attempts))
            time.sleep(glob.config["scrape"]["delay"] + extra_sleep)
        if failed_streak > glob.config["scrape"]["skips_until_finished"]:
            finished = True
            break

    cur.close()
    sql.close()
    print("[Scraper] Finished!")

@@ -3,23 +3,16 @@
     "host": "0.0.0.0",
     "port": 3003
   },
-  "sql": {
-    "host": "127.0.0.1",
-    "user": "root",
-    "passwd": "toor",
-    "db": "osu-wayback"
-  },
   "scrape": {
     "delay": 1,
     "max_attempts": 3,
-    "increase_delay_on_fail": true
+    "increase_delay_on_fail": true,
+    "skips_until_finished": 17,
+    "schedule_seconds": 68400,
+    "run_on_startup": true
   },
   "downloader": {
     "download_folder": "/home/wayback/files"
   },
-  "zipper": {
-    "temp_folder": "/home/wayback/tmp",
-    "output_folder": "/home/wayback/archive"
-  },
   "threads": 4
 }
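
Two of the new keys are worth spelling out: schedule_seconds = 68400 is 19 hours between scheduled runs (a full day would be 86400), and skips_until_finished = 17 is the failure-streak length (counted per attempt in scrape.run()) at which the scraper gives up. A minimal sketch of reading these values with plain json.load, mirroring the open("config.json", ...) pattern in the deleted scripts below; the actual loader behind glob.config is not shown in this diff:

import json

with open("config.json", "r") as f:
    config = json.load(f)

interval = config["scrape"]["schedule_seconds"]        # 68400 s = 19 h between runs
run_now = config["scrape"]["run_on_startup"]           # scrape once immediately at startup
max_streak = config["scrape"]["skips_until_finished"]  # stop after this many consecutive failed attempts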

@@ -76,4 +76,7 @@ if __name__ == "__main__":
     print("To stop server press CTRL + C")
     glob.app.listen(glob.config["server"]["port"], address=glob.config["server"]["host"])
     tornado.log.enable_pretty_logging()
-    tornado.ioloop.IOLoop.instance().start()
+
+    import autoSchedule
+    tornado.ioloop.IOLoop.instance().start()
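
The placement of the new import matters: importing autoSchedule is what spawns the scheduler thread (see autoSchedule/__init__.py above), and IOLoop.instance().start() blocks until the server shuts down, so the import has to come first. The resulting startup order, with the surrounding Tornado setup elided as in the hunk:

tornado.log.enable_pretty_logging()

import autoSchedule                       # side effect: starts schedule_loop in a background thread
tornado.ioloop.IOLoop.instance().start()  # blocks here while the server runs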

@@ -1,67 +0,0 @@
#Used for scraping osu files from osu.ppy.sh and storing them into self database
import urllib.request, json
import MySQLdb
import MySQLdb.cursors
import time, calendar
import atexit

finished = False

with open("config.json", "r") as f:
    config = json.load(f)
with open("memory.json", "r") as f:
    memory = json.load(f)

sql = MySQLdb.connect(**config["sql"], cursorclass = MySQLdb.cursors.DictCursor)
cur = sql.cursor()


def on_close():
    with open("memory.json", "w") as f:
        json.dump(memory, f)
    print("Closing...")


atexit.register(on_close)

failed_streak = 0
while not finished:
    target = memory["scrape"]["last"] + 1
    attempts = 0
    completed = False
    extra_sleep = 0
    while attempts < config["scrape"]["max_attempts"] and not completed:
        try:
            with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target={}".format(target)) as url:
                data = json.loads(url.read().decode())[0]
            if "url_patch" not in data.keys():
                data["url_patch"] = None
            cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                [
                    data["file_version"],
                    data["filename"],
                    data["file_hash"],
                    data["filesize"],
                    calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
                    data["patch_id"],
                    data["url_full"],
                    data["url_patch"]
                ])
            sql.commit()
            completed = True
            failed_streak = 0
            print("target: {}, status: OK".format(target))
        except:
            if target not in memory["scrape"]["failed"]:
                memory["scrape"]["failed"].append(target)
            attempts += 1
            failed_streak += 1
            if config["scrape"]["increase_delay_on_fail"]:
                extra_sleep = attempts
            print("target: {}, status: FAILED, attempt: {}".format(target, attempts))
        time.sleep(config["scrape"]["delay"] + extra_sleep)
    if failed_streak > 100:
        exit()
    memory["scrape"]["last"] = target

@@ -1,50 +0,0 @@
#Used to zip all the files into new local folders after downloader is done
import urllib.request, json
import MySQLdb
import MySQLdb.cursors
import os
import atexit

with open("config.json", "r") as f:
    config = json.load(f)
with open("memory.json", "r") as f:
    memory = json.load(f)

sql = MySQLdb.connect(**config["sql"], cursorclass = MySQLdb.cursors.DictCursor)
cur = sql.cursor()


def on_close():
    with open("memory.json", "w") as f:
        json.dump(memory, f)
    print("Closing...")


atexit.register(on_close)

cur.execute("SELECT file_version,filename,file_hash,url_full FROM updates")
data = cur.fetchall()

# Remove already downloaded files (checked from memory.json)
data = data[memory["zipper"]["last"]:]

# Unfinished - replace with zipper code
"""
for row in data:
    try:
        print("Downloading {} with id {}".format(row["filename"], row["file_version"]))
        urllib.request.urlretrieve(
            row["url_full"],
            os.path.join(
                config["downloader"]["download_folder"],
                row["filename"],
                "f_" + row["file_hash"]
            )
        )
        print("Done.")
    except Exception as e:
        memory["downloader"]["failed"].append(row["file_version"])
        print("Error downloading file {}: {}".format(row["file_version"], e))
    memory["downloader"]["last"] += 1
"""