Commit 19a0f667 authored by John
Browse files

First upload of automatic source download integration

parent 91b70dac
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -5,6 +5,8 @@ result
ballontranslator/data/models
ballontranslator/data/testpacks/eng_dontupload
ballontranslator/data/testpacks/testpacks
ballontranslator/data/*.png
ballontranslator/ui/pagesources/projects
release

tmp.py
+5 −0
Original line number Diff line number Diff line
# List of known working sources
+ mangairo
+ asura scans
+ reaper scans
+ nhentai
 No newline at end of file
+144 −0
Original line number Diff line number Diff line
import requests
import undetected_chromedriver as uc
from undetected_chromedriver import ChromeOptions
from bs4 import BeautifulSoup
import re
import time
from constants import SOURCE_DOWNLOAD_PATH
from exceptions import ImagesNotFoundInRequest, NotValidUrl
import os


class SourceBase:
    """
    downloads the page images found on a single url into a project directory

    the directory is created below SOURCE_DOWNLOAD_PATH and is named after the title
    (or after the sanitized url if no title was set), images are stored as
    001.jpg, 002.jpg, ... next to a pages.txt file holding the number of pages
    """

    #  group 2 is the part of the file name in front of 'jpg', used to filter and de-duplicate images
    _IMAGE_NAME_PATTERN = re.compile(r'https://(.*)/(.*?)jpg')

    #  characters removed from a url before it is used as a directory name
    #  (raw string: '\/' in a normal string is an invalid escape sequence)
    _ILLEGAL_PATH_CHARS = r'\/:*?"<>|'

    #  seconds before a stalled image request is aborted instead of hanging forever
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        self.url: str = ''
        self.path: str = ''
        self.title: str = ''
        #  file names that are never treated as pages
        self.template: list[str] = ['cover']
        self.image_urls: list[str] = []
        self.last_page_num: int = 0

    def SetUrl(self, url):
        """
        @param url: address of the page the images are read from
        """
        self.url = url

    def SetTitle(self, title):
        """
        @param title: name of the directory the images are saved to
        """
        self.title = title

    def SaveNumberOfPages(self, path):
        """
        write the known number of pages to a text file, replacing its previous content

        @param path: path of the text file
        """
        #  mode 'w' already truncates the file, no separate clearing step is needed
        with open(path, 'w') as txt:
            txt.write(str(self.last_page_num))

    def ReturnNumberOfPages(self) -> int:
        """
        @return: number of pages of the last download / last successful file check
        """
        return self.last_page_num

    def CheckFiles(self, path) -> bool:
        """
        check whether the download directory already holds every page

        reads the expected page count from the first line of the text file and compares it
        with the number of .jpg files found in self.path, on success the page count is
        stored in self.last_page_num as an int

        @param path: path of the text file holding the page count
        @return: True if the counts match, False if they differ or if the text file or
                 the directory is missing, or the file is empty or not a number
        """
        #  read known page number
        try:
            with open(path, 'r') as txt:
                first_line = txt.readline()
            #  int('') raises ValueError as well, so an empty file is handled here too
            self.last_page_num = int(first_line)
        except (FileNotFoundError, ValueError):
            return False

        #  count images in directory
        try:
            files = os.listdir(self.path)
        except FileNotFoundError:
            return False
        number_of_images = sum(1 for name in files if name.endswith('.jpg'))

        return number_of_images == self.last_page_num

    def FetchImageUrls(self, force_redownload: bool = False):
        """
        collect the image urls of self.url and download them unless they are already on disk

        @param force_redownload: download again even if the directory already looks complete
        """
        self.path = self._build_download_path()
        path_to_page_num = os.path.join(self.path, 'pages.txt')

        #  check if the files are already downloaded
        are_downloaded = False
        if not os.path.exists(self.path):
            os.makedirs(self.path)
        elif not force_redownload:
            are_downloaded = self.CheckFiles(path_to_page_num)

        if are_downloaded:
            return

        #  start from a fresh list so repeated runs do not pile up urls of earlier runs
        self.image_urls = self._filter_image_urls(self._scrape_image_sources())

        #  download images
        self.DownloadImages()

    def _build_download_path(self) -> str:
        #  directory below SOURCE_DOWNLOAD_PATH named after the title or the sanitized url
        if self.title:
            return os.path.join(SOURCE_DOWNLOAD_PATH, self.title)

        #  filter url for illegal characters
        _url = self.url.translate({ord(c): None for c in self._ILLEGAL_PATH_CHARS})
        return os.path.join(SOURCE_DOWNLOAD_PATH, _url)

    def _scrape_image_sources(self) -> list[str]:
        #  load self.url in a headless browser and return the src of every <img> that has one
        options = ChromeOptions()
        options.add_argument('headless')
        driver = uc.Chrome(options=options)

        try:
            #  load page and wait for cloudflare to pass
            driver.get(self.url)
            time.sleep(10)
            page_source = driver.page_source
        finally:
            #  always shut the browser down, otherwise every call leaks a chrome process
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')
        #  <img> tags without a src attribute are skipped instead of raising KeyError
        return [img.get('src') for img in soup.find_all('img') if img.get('src')]

    def _filter_image_urls(self, urls: list[str]) -> list[str]:
        #  keep https urls whose short file name contains a digit, dropping repeated file names
        selected = []
        seen_names = set()

        for url in urls:
            if 'https' not in url:
                continue

            match = self._IMAGE_NAME_PATTERN.search(url)
            if match is None:
                continue

            name = match.group(2)
            is_page = (
                any(char.isdigit() for char in name)
                and name not in seen_names
                and len(name) < 10
                and name not in self.template
            )
            if is_page:
                selected.append(url.replace(' ', '%20'))

            #  remembered even when rejected, a later url with the same name is a duplicate
            seen_names.add(name)

        return selected

    def DownloadImages(self):
        """
        download every url in self.image_urls to self.path as 001.jpg, 002.jpg, ...
        and save the resulting number of pages to pages.txt
        """
        for page_number, image_url in enumerate(self.image_urls, start=1):
            img_data = requests.get(image_url, timeout=self._REQUEST_TIMEOUT).content

            with open(os.path.join(self.path, f'{page_number:03}.jpg'), 'wb') as image:
                image.write(img_data)

            #  Avoid IP ban
            time.sleep(1)

        self.last_page_num = len(self.image_urls)

        self.SaveNumberOfPages(os.path.join(self.path, 'pages.txt'))

    def run(self, url: str, force_redownload: bool, title: str = ''):
        """
        set url and title, then fetch the images

        @param url: address of the page the images are read from
        @param force_redownload: download again even if the files are already on disk
        @param title: directory name, an empty title keeps the current one
        """
        self.SetUrl(url)
        if title:
            self.SetTitle(title)
        self.FetchImageUrls(force_redownload)


if __name__ == '__main__':
    #  manual run: fetch one gallery into the 'Pog' directory without forcing a redownload
    demo_source = SourceBase()
    demo_source.run(
        'https://nhentai.net/g/444882/',
        force_redownload=False,
        title='Pog',
    )
+5 −0
Original line number Diff line number Diff line
import os.path as osp

#  directory that contains this file
UI_PATH = osp.dirname(osp.abspath(__file__))
#  parent directory of UI_PATH
PROGRAM_PATH = osp.dirname(UI_PATH)
#  root directory of downloaded sources, joined component by component so that no
#  path separator is hard-coded (a literal backslash only works on Windows)
SOURCE_DOWNLOAD_PATH = osp.join(PROGRAM_PATH, "pagesources", "projects")
 No newline at end of file
+40 −0
Original line number Diff line number Diff line
#  BaseError structure stolen from dmMaze
class BaseError(Exception):
    """
    base error structure class

    str() of the error is rendered as "<val> --> <message>"
    """

    def __init__(self, val, message):
        """
        @param val: actual value
        @param message: message shown to the user
        """
        self.val = val
        self.message = message
        #  hand both values to Exception so that args and repr() carry them and the
        #  error can be rebuilt from args when it is copied or pickled
        super().__init__(val, message)

    def __str__(self):
        return f"{self.val} --> {self.message}"


class NotValidUrl(BaseError):
    """
    raised when the text entered by the user is not a valid url
    """

    def __init__(
        self,
        val,
        message='text must be a valid url, it must also include https://',
    ):
        """
        @param val: the rejected text
        @param message: message shown to the user
        """
        super().__init__(val, message)


class ImagesNotFoundInRequest(BaseError):
    """
    raised when no images could be located on a website
    """

    def __init__(
        self,
        val,
        message='the specified website is not currently supported, '
                'you can always suggest implementation of any website on github',
    ):
        """
        @param val: actual value
        @param message: message shown to the user
        """
        super().__init__(val, message)
Loading