Commit de1a698c authored by TheTechRobo's avatar TheTechRobo
Browse files

New experiments system for me to find edge cases

Starting with WBM's video availability endpoints.
parent e8a82198
Loading
Loading
Loading
Loading

EXPERIMENTS.txt

0 → 100644
+4 −0
Original line number Diff line number Diff line
=== Current Experiments ===

- Wayback Machine: Checks two supposedly equivalent API endpoints (fakeurl and videoinfo). If their results don't match, a report is sent to my server. I have been made aware of a video that is not recognized by the fakeurl endpoint but is recognized by the videoinfo endpoint. Only the experiment name and the video ID are included in the report.
+11 −0
Original line number Diff line number Diff line
@@ -67,3 +67,14 @@ methods:

# Global User-Agent
user_agent: "FindYoutubeVideo/1.0 operated by XYZ"

# Sets the experiment base URL. *If you want to disable experiments, set this to null.*
# Occasionally, there might be something I want to test; for example, looking for
# edge cases in an API endpoint.
# If experiments are on, the site will send the results of these experiments to my server.
# The following information will be included in the report:
# - video ID
# - any necessary additional information
# Your IP address WILL NOT be associated with the report.
# Current list of experiments is in EXPERIMENTS.txt.
experiment_base_url: "https://fyt-helper.thetechrobo.ca/experiment"
+30 −2
Original line number Diff line number Diff line
@@ -5,9 +5,20 @@ All the Service implementations live here.
import random, time, urllib.parse, aiohttp, asyncio
import typing_extensions as typing
from switch import Switch
from .types import YouTubeService, methods
from .types import YouTubeService, methods, experiment_base_url
from yarl import URL

async def submit_experiment(session: aiohttp.ClientSession, experiment_name: str, video_id: str):
    """
    Report an experiment result to the configured experiment server.

    Does nothing when ``experiment_base_url`` is unset (experiments disabled).
    Reporting is best-effort: a failure is logged at debug level and otherwise
    ignored, so it never breaks the lookup that triggered the report.

    Args:
        session: The aiohttp session used to send the report.
        experiment_name: Identifier of the experiment being reported.
        video_id: The video ID the experiment result refers to.
    """
    if not experiment_base_url:
        return

    import logging

    report = {
        "experiment": experiment_name,
        "id": video_id,
    }
    try:
        # `async with` releases the response (and its connection) back to the
        # pool; a bare `await session.post(...)` leaves it unreleased.
        # The short timeout keeps a slow experiment server from stalling
        # the caller for the session's (much longer) default timeout.
        async with session.post(
            experiment_base_url,
            json=report,
            timeout=aiohttp.ClientTimeout(total=5),
        ):
            pass
    except Exception:
        # Deliberately best-effort: never let a failed report propagate.
        logging.getLogger(__name__).debug(
            "Failed to submit experiment %r", experiment_name, exc_info=True
        )

class YouTube(YouTubeService):
    """
    Checks if the video is still available on YouTube.
@@ -58,6 +69,23 @@ class WaybackMachine(YouTubeService):
            if redirect:
                u = URL(redirect)
                assert u.path != "/sry", "Redirected to sorry page. Is IA down?"
            fakeurl_archived = archived

        params = {"vtype": "youtube", "vid": id}
        async with session.get("https://web.archive.org/__wb/videoinfo", params=params, timeout=5) as response:
            viresp = await response.json()
            videoinfo_archived = bool(viresp.get("formats"))
            if videoinfo_archived:
                archived = True
        if fakeurl_archived != videoinfo_archived:
            await submit_experiment(session, "wb-index-weirdness", id)
            if videoinfo_archived:
                # TODO: better sorting system; right now while this is
                # an edge case I'm not going to bother, but if it ever is the default
                # this should be improved
                format = viresp['formats'][0]
                url, ts = format['url'], format['timestamp']
                lien = f"https://web.archive.org/web/{ts}/{url}"

        response2 = None
        url_formats = [
@@ -103,7 +131,7 @@ class WaybackMachine(YouTubeService):
                        lien = response2["archived_snapshots"]["closest"]["url"]
                        break

        rawraw = (redirect, response2)
        rawraw = (redirect, viresp, response2)
        return cls(
            archived=archived, capcount=int(archived), rawraw=rawraw, available=lien,
            lastupdated=time.time(), name=cls.getName(), note="", metaonly=ismeta,
+4 −1
Original line number Diff line number Diff line
@@ -19,6 +19,9 @@ with open('config.yml', 'r') as file:
    config_yml = yaml.safe_load(file)
    methods = config_yml["methods"]
    user_agent = config_yml.get("user_agent") # defaults to None if not set
    experiment_base_url = config_yml.get("experiment_base_url")
    if experiment_base_url:
        experiment_base_url = experiment_base_url.rstrip("/")

@dataclasses.dataclass
class Service(JSONDataclass):
@@ -33,7 +36,7 @@ class Service(JSONDataclass):
        lastupdated (int): The timestamp the data was retrieved from the server. Used internally to expire cache entries.
        name (str): The name of the service. Used in the UI.
        note (str): A footnote about the service. This could be different depending on conditions. For example, the Internet Archive has an extra passage if the item is dark. Used in the UI.
        rawraw (Any): The data used to check whether the video is archived on that particular service. For example, for GhostArchive, it would be the HTTP status code.
        rawraw (Any): The data used to check whether the video is archived on that particular service. For example, for GhostArchive, it would be the HTTP status code. The structure could change at any time.
        metaonly (bool): True if only the metadata is archived. This value should not be relied on!
        comments (bool): True if the comments are archived. This value should not be relied on!
        maybe_paywalled (bool): True if the service might require payment.