Scraper
parent 1948e229ec
commit d75985651f
autoSchedule/__init__.py (new file)
@@ -0,0 +1,19 @@
from time import time, sleep
from threading import Thread

from . import scrape

from objects import glob

def schedule_loop():
    # Run one scrape immediately on startup if configured, then repeat on a fixed interval.
    if glob.config["scrape"]["run_on_startup"]:
        scrape.run()

    while True:
        _time = time() + glob.config["scrape"]["schedule_seconds"]
        while _time - time() > 0:
            sleep(1)
        scrape.run()

thread = Thread(target = schedule_loop)
thread.start()
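Note that __init__.py creates and starts the scheduler thread at import time, so importing the package is all that is needed to start the loop. A minimal usage sketch, assuming the host application has already initialised objects.glob and loaded glob.config:

# Hypothetical entry point, not part of this commit: glob.config must be loaded
# before the import, since schedule_loop reads the "scrape" settings immediately.
import autoSchedule  # importing the package starts the background schedule_loop thread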
autoSchedule/scrape.py (new file)
@@ -0,0 +1,64 @@
import urllib.request, json
import time, calendar

from objects import glob

# Module-level flag so that only one scraping job can run at a time.
finished = True

def run():
    global finished

    if not finished:
        print("[!] Scraper job is already running!")
        return

    print("[!] Starting scraping job...")
    sql = glob.new_sql()
    cur = sql.cursor()

    # Resume from the newest file_version already stored in the updates table.
    cur.execute("SELECT max(file_version) f FROM updates")
    target = cur.fetchone()["f"]

    finished = False
    failed_streak = 0
    while not finished:
        target += 1

        # Retry the current target up to max_attempts times before moving on.
        attempts = 0
        extra_sleep = 0
        while attempts < glob.config["scrape"]["max_attempts"]:
            try:
                with urllib.request.urlopen("https://osu.ppy.sh/web/check-updates.php?action=path&stream=stable&target=%s" % target) as url:
                    data = json.loads(url.read().decode())[0]
                    if "url_patch" not in data.keys():
                        data["url_patch"] = None
                    cur.execute("INSERT INTO updates (file_version,filename,file_hash,filesize,timestamp,patch_id,url_full,url_patch) VALUES ('%s','%s','%s','%s',%s,'%s','%s','%s')" % (
                        data["file_version"],
                        data["filename"],
                        data["file_hash"],
                        data["filesize"],
                        calendar.timegm(time.strptime(data["timestamp"], "%Y-%m-%d %H:%M:%S")),
                        data["patch_id"],
                        data["url_full"],
                        data["url_patch"],
                    ))
                    sql.commit()
                    failed_streak = 0
                    print("[Scraper] Target: %s, Status: OK" % target)
                break
            except:
                attempts += 1
                failed_streak += 1
                if glob.config["scrape"]["increase_delay_on_fail"]:
                    extra_sleep = attempts
                print("[Scraper] Target: %s, Status: FAILED, Attempt: %s" % (target, attempts))

            time.sleep(glob.config["scrape"]["delay"] + extra_sleep)
        # Stop once enough consecutive requests have failed; the newest build has likely been reached.
        if failed_streak > glob.config["scrape"]["skips_until_finished"]:
            finished = True
            break

    cur.close()
    sql.close()
    print("[Scraper] Finished!")
config.json
@@ -3,23 +3,16 @@
"host": "0.0.0.0",
|
||||
"port": 3003
|
||||
},
|
||||
"sql": {
|
||||
"host": "127.0.0.1",
|
||||
"user": "root",
|
||||
"passwd": "toor",
|
||||
"db": "osu-wayback"
|
||||
},
|
||||
"scrape": {
|
||||
"delay": 1,
|
||||
"max_attempts": 3,
|
||||
"increase_delay_on_fail": true
|
||||
"max_attempts": 3,
|
||||
"increase_delay_on_fail": true,
|
||||
"skips_until_finished": 17,
|
||||
"schedule_seconds": 68400,
|
||||
"run_on_startup": true
|
||||
},
|
||||
"downloader": {
|
||||
"download_folder": "/home/wayback/files"
|
||||
},
|
||||
"zipper": {
|
||||
"temp_folder": "/home/wayback/tmp",
|
||||
"output_folder": "/home/wayback/archive"
|
||||
},
|
||||
"threads": 4
|
||||
}
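Put together, the "scrape" block of config.json after this change reads as follows (values taken from the added lines above); 68400 seconds corresponds to a scheduled run every 19 hours:

  "scrape": {
    "delay": 1,
    "max_attempts": 3,
    "increase_delay_on_fail": true,
    "skips_until_finished": 17,
    "schedule_seconds": 68400,
    "run_on_startup": true
  },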