Commit 6fa6a253 authored by John's avatar John
Browse files

Implemented downloads based on gallery-dl library

parent 2dec7cb0
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -6,7 +6,8 @@ ballontranslator/data/models
ballontranslator/data/testpacks/eng_dontupload
ballontranslator/data/testpacks/testpacks
ballontranslator/data/*.png
ballontranslator/ui/pagesources/projects
ballontranslator/dl/pagesources/gallery-dl
ballontranslator/gallery-dl
release

tmp.py
+1 −9
Original line number Diff line number Diff line
# List of known working sources
+ ~~mangairo~~ | raised its Cloudflare protection
+ asura scans
+ reaper scans
+ nhentai

<span style="color:red"> <span style="font-size: medium; "> If you are experiencing issues with your source, it could indicate that it is utilizing a different structure or has raised its Cloudflare protection. </span> </span>

Keep in mind that you can request implementation of any source, although there is a possibility of rejection.
# Please refer to https://github.com/mikf/gallery-dl
 No newline at end of file
+31 −151
Original line number Diff line number Diff line
import shutil
import requests
import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import time
from .constants import SOURCE_DOWNLOAD_PATH
from .exceptions import ImagesNotFoundInRequest, NotValidUrl
from gallery_dl.job import DownloadJob
from gallery_dl import config
from ui.mainwindow import MainWindow
from utils.logger import logger as LOGGER
import os


class SourceBase:
    """
    Shared state and small helpers for page-source downloaders.

    Holds the source url, the local project path, an optional title, the
    collected image urls and the page count of the last download.
    """

    def __init__(self):
        self.url: str = ''
        self.path: str = ''
        self.title: str = ''
        #  image names that must never be treated as a page image
        self.template: list[str] = ['cover']
        self.image_urls: list[str] = []
        self.last_page_num: int = 0

    def SetUrl(self, url):
        """
        Store the url and let ``CheckLink`` normalise it.

        NOTE: ``CheckLink`` is not defined on this class; it has to be
        supplied by whichever subclass is actually instantiated.

        @param url: address of the page to download from
        """
        self.url = url
        self.CheckLink()

    def SetTitle(self, title):
        """
        @param title: title stored on the instance as ``self.title``
        """
        self.title = title

    def SaveNumberOfPages(self, path):
        """
        Write ``self.last_page_num`` to the text file at ``path``.

        @param path: file that receives the page count; previous content is replaced
        """
        #  'w' mode already truncates the file, so a separate clearing step is not needed
        with open(path, 'w') as txt:
            txt.write(str(self.last_page_num))

    def ReturnNumberOfPages(self) -> int:
        """
        @return: page count currently stored in ``self.last_page_num``
        """
        return self.last_page_num
from ui.misc import ProgramConfig
from ui.imgtrans_proj import ProjImgTrans

class SourceDownload:
    def __init__(self, config: ProgramConfig, imgtrans_proj: ProjImgTrans, menu: MainWindow, *args, **kwargs):
        """
        Keep references to the collaborators and start with an empty url/path.

        @param config: program configuration, stored as ``self.config_pnl``
        @param imgtrans_proj: project object, stored as ``self.imgtrans_proj``
        @param menu: main window, stored as ``self.menu``
        @param args: extra positional arguments forwarded to the parent initialiser
        @param kwargs: extra keyword arguments forwarded to the parent initialiser
        """
        super().__init__(*args, **kwargs)
        #  nothing has been downloaded yet
        self.url = ''
        self.path = ''
        #  collaborators handed in by the caller
        self.menu = menu
        self.imgtrans_proj = imgtrans_proj
        self.config_pnl = config

    def ReturnFullPathToProject(self) -> str:
        """
        @return: the directory path currently stored in ``self.path``
        """
        project_path = self.path
        return project_path

    def CheckLink(self):
    def ValidateUrl(self):
        """
        Make sure ``self.url`` carries a scheme, defaulting to ``https://``.

        A url that already names a scheme (anything containing ``://``) is
        left untouched, so ``http://`` links are no longer turned into
        ``https://http://...``. A bare host/path gets ``https://`` prepended.
        """
        if '://' not in self.url:
            self.url = 'https://' + self.url

    def CheckFiles(self, path):
        """
        Tell whether a previous download into ``self.path`` is complete.

        Reads the expected page count from the first line of the file at
        ``path`` and compares it with the number of ``.jpg`` files found in
        ``self.path``. On success the count is kept in ``self.last_page_num``
        as an int.

        @param path: text file holding the page count of the last download
        @return: True when the stored count equals the number of ``.jpg`` files;
                 False when the file is missing, empty, not a number, or the
                 counts differ
        """
        #  read known page number
        try:
            with open(path, 'r') as txt:
                expected_pages = int(txt.readline())
        except (FileNotFoundError, ValueError):
            #  no usable record of a finished download
            return False

        self.last_page_num = expected_pages

        #  count images in directory; endswith matches the filter used by ClearDirectory
        number_of_images = sum(1 for name in os.listdir(self.path) if name.endswith('.jpg'))
        return number_of_images == expected_pages
    @staticmethod
    def ClearDirectory(path):
        filelist = [f for f in os.listdir(path) if f.endswith(".jpg")]
        for f in filelist:
            os.remove(os.path.join(path, f))

    def FetchImageUrls(self, force_redownload: bool = False):
        """
        Scrape ``self.url`` for page images and download them.

        The target directory is ``SOURCE_DOWNLOAD_PATH`` plus either the title
        or, when no title is set, the url with punctuation removed. When that
        directory already exists and ``CheckFiles`` reports a complete
        download, nothing is fetched unless ``force_redownload`` is set.

        @param force_redownload: skip the completeness check and download again
        @raise ImagesNotFoundInRequest: no image url on the page passed the filter
        """
        LOGGER.info('Scraping website for images')

        if not self.title:
            #  drop separators and punctuation so the url can serve as one directory name
            #  (raw string: '\.' is not a valid escape in a normal literal)
            _url = self.url.translate(str.maketrans('', '', r'\./:*?"<>|'))
            self.path = rf'{SOURCE_DOWNLOAD_PATH}\{_url}'

        else:
            self.path = rf'{SOURCE_DOWNLOAD_PATH}\{self.title}'

        path_to_page_num = rf'{self.path}\pages.txt'

        are_downloaded = False
        if not os.path.exists(self.path):
            os.makedirs(self.path)
        elif force_redownload is False:
            are_downloaded = self.CheckFiles(path_to_page_num)

        if are_downloaded:
            return

        self.ClearDirectory(self.path)

        options = Options()
        # options.add_argument("--headless")
        driver = uc.Chrome(options=options)
        try:
            driver.get(self.url)
            #  wait for cloudflare to pass
            time.sleep(10)
            page_source = driver.page_source
        finally:
            #  always end the browser session, even when loading the page fails
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')

        #  <img> tags without a src attribute are skipped instead of raising KeyError
        sources = (img.get('src') for img in soup.find_all('img'))
        images = [src for src in sources if src and 'https' in src]

        #  filter images for only those with numbers at the end and makes sure there are no duplicates
        name_pattern = re.compile('https://(.*)/(.*?)jpg')
        seen_names = set()
        for image_url in images:
            match = name_pattern.search(image_url)
            if match is None:
                #  not a jpg link of the expected shape
                continue

            name = match.group(2)
            is_page = (
                any(ch.isdigit() for ch in name)
                and name not in seen_names
                and len(name) < 10
                and name not in self.template
            )
            if is_page:
                self.image_urls.append(image_url.replace(' ', '%20'))

            #  remember every name, accepted or not, so later duplicates are rejected
            seen_names.add(name)

        if not self.image_urls:
            raise ImagesNotFoundInRequest(self.image_urls)

        self.WebsiteExceptions()
        self.DownloadImages()

    def WebsiteExceptions(self):
        """
        Apply site-specific rewrites to the collected image urls.

        When any url contains ``nhentai``, every url has ``https://t``
        replaced by ``https://i`` and ``t.jpg`` replaced by ``.jpg``
        (presumably turning thumbnail links into full-size ones). The list
        in ``self.image_urls`` is updated in place.
        """
        collected = self.image_urls
        if not any('nhentai' in link for link in collected):
            return

        #  slice assignment keeps the very same list object
        collected[:] = [
            link.replace('https://t', 'https://i').replace('t.jpg', '.jpg')
            for link in collected
        ]

    def DownloadImages(self):
        n = 1
        LOGGER.info('Downloading images')

        for i in self.image_urls:
            img_data = requests.get(i, stream=True)
    def FetchImageUrls(self):
        """
        Download ``self.url`` through a gallery-dl ``DownloadJob``.

        After the job has run, the directory reported by the job's path
        formatter is stored in ``self.path``.
        """
        #  configuration has to be loaded before the job is created
        config.load()
        download_job = DownloadJob(self.url)
        download_job.run()
        self.path = download_job.pathfmt.directory

            with open(rf'{self.path}\{n:03}.jpg', 'wb') as image:
                shutil.copyfileobj(img_data.raw, image)
            n += 1
            #  Avoid IP ban
            time.sleep(1)
    def download_source(self):
        self.url = self.config_pnl.src_link_flag
        if self.url:
            LOGGER.info(f'Url set to {self.url}')

        self.last_page_num = len(self.image_urls)
            self.ValidateUrl()
            self.FetchImageUrls()

        self.SaveNumberOfPages(rf'{self.path}\pages.txt')
            proj_path = self.ReturnFullPathToProject()
            LOGGER.info(f'Project path set to {proj_path}')

    def run(self, url: str, force_redownload: bool, title: str = ''):
        self.SetUrl(url)
        if title:
            self.SetTitle(title)
        self.FetchImageUrls(force_redownload)
            if proj_path:
                self.menu.openDir(proj_path)
                LOGGER.info('Download complete')
+0 −5
Original line number Diff line number Diff line
import os.path as osp

#  directory that contains this file
UI_PATH = osp.dirname(osp.abspath(__file__))
#  parent directory of UI_PATH
PROGRAM_PATH = osp.dirname(UI_PATH)
#  components are joined separately so the path is valid with any separator,
#  not only with a backslash
SOURCE_DOWNLOAD_PATH = osp.join(PROGRAM_PATH, "pagesources", "projects")
 No newline at end of file
+0 −40
Original line number Diff line number Diff line
#  BaseError structure stolen from dmMaze
class BaseError(Exception):
    """
    base error structure class

    Carries the offending value together with a message for the user;
    ``str()`` renders them as ``"<val> --> <message>"``.
    """

    def __init__(self, val, message):
        """
        @param val: actual value
        @param message: message shown to the user
        """
        self.val = val
        self.message = message
        #  pass both values on so ``args`` is populated; with empty ``args`` the
        #  exception cannot be copied or pickled because it is rebuilt from ``args``
        super().__init__(val, message)

    def __str__(self):
        return "{} --> {}".format(self.val, self.message)


class NotValidUrl(BaseError):
    """
    raised when the url entered by the user is not valid
    """

    def __init__(
            self,
            val,
            message='text must be a valid url, it must also include https://'):
        """
        @param val: the rejected url
        @param message: message shown to the user
        """
        super().__init__(val, message)


class ImagesNotFoundInRequest(BaseError):
    """
    raised when no images could be located on a website
    """

    def __init__(
            self,
            val,
            message='the specified website is not currently supported, '
                    'you can always suggest implementation of any website on github'):
        """
        @param val: the value the search ended up with
        @param message: message shown to the user
        """
        super().__init__(val, message)
Loading