Implemented downloads based on gallery-dl library (6fa6a253) · Commits · git-mirror / BallonsTranslator

.gitignore

+2 −1

Original line number	Diff line number	Diff line
		@@ -6,7 +6,8 @@ ballontranslator/data/models
		ballontranslator/data/testpacks/eng_dontupload
		ballontranslator/data/testpacks/testpacks
		ballontranslator/data/*.png
		ballontranslator/ui/pagesources/projects
		ballontranslator/dl/pagesources/gallery-dl
		ballontranslator/gallery-dl
		release

		tmp.py

ballontranslator/dl/pagesources/IMPLEMENTED_SOURCES.md

+1 −9

Original line number	Diff line number	Diff line
		# List of known working sources
		+ ~~mangairo~~ \| raised cloudflare protection
		+ asura scans
		+ reaper scans
		+ nhentai

		<span style="color:red"> <span style="font-size: medium; "> If you are experiencing issues with your source, it could indicate that it is utilizing a different structure or has raised its Cloudflare protection. </span> </span>

		Keep in mind that you can request implementation of any source, although there is a possibility of rejection.
		# Please refer to https://github.com/mikf/gallery-dl
		No newline at end of file

ballontranslator/dl/pagesources/init.py

+31 −151

Original line number	Diff line number	Diff line
		import shutil
		import requests
		import undetected_chromedriver as uc
		from selenium.webdriver.chrome.options import Options
		from bs4 import BeautifulSoup
		import re
		import time
		from .constants import SOURCE_DOWNLOAD_PATH
		from .exceptions import ImagesNotFoundInRequest, NotValidUrl
		from gallery_dl.job import DownloadJob
		from gallery_dl import config
		from ui.mainwindow import MainWindow
		from utils.logger import logger as LOGGER
		import os


		class SourceBase:
		    """State holder and small helpers for one page-source download.

		    Tracks the source URL, the local target directory, an optional title,
		    the collected image URLs and the number of pages that were found.
		    """

		    def __init__(self):
		        self.url: str = ''
		        self.path: str = ''
		        self.title: str = ''
		        # Names listed here are excluded when image URLs are filtered.
		        self.template: list[str] = ['cover']
		        self.image_urls: list[str] = []
		        self.last_page_num: int = 0

		    def SetUrl(self, url):
		        """Store ``url`` and run ``self.CheckLink()`` on it.

		        ``CheckLink`` is not defined in this part of the class; it must be
		        available on the instance when this method is called.
		        """
		        self.url = url
		        self.CheckLink()

		    def SetTitle(self, title):
		        """Store the title used to name the download directory."""
		        self.title = title

		    def SaveNumberOfPages(self, path):
		        """Write ``self.last_page_num`` as text to ``path``, replacing its content."""
		        # Mode 'w' already truncates an existing file, so no separate
		        # "clear the file" step is required before writing.
		        with open(path, 'w') as txt:
		            txt.write(str(self.last_page_num))

		    def ReturnNumberOfPages(self) -> int:
		        """Return the stored page count."""
		        return self.last_page_num
		from ui.misc import ProgramConfig
		from ui.imgtrans_proj import ProjImgTrans

		class SourceDownload:
		def __init__(self, config: ProgramConfig, imgtrans_proj: ProjImgTrans, menu: MainWindow, *args, **kwargs):
		    """Keep references to the program config, the project and the main window.

		    Args:
		        config: program configuration; stored as ``self.config_pnl``.
		        imgtrans_proj: project object; stored as ``self.imgtrans_proj``.
		        menu: main window; stored as ``self.menu``.
		        *args, **kwargs: forwarded unchanged to the next ``__init__`` in
		            the MRO. The class declares no explicit base, so passing any
		            extra argument makes ``object.__init__`` raise ``TypeError``.
		    """
		    # Forward with proper unpacking; passing ``args`` as a plain positional
		    # would hand an argument to object.__init__ and fail on every call.
		    super().__init__(*args, **kwargs)
		    self.config_pnl = config
		    self.imgtrans_proj = imgtrans_proj
		    self.menu = menu
		    # Filled in later by the download step.
		    self.path = ''
		    self.url = ''

		def ReturnFullPathToProject(self) -> str:
		    """Return the directory path currently stored in ``self.path``."""
		    project_dir = self.path
		    return project_dir

		def CheckLink(self):
		def ValidateUrl(self):
		    """Make sure ``self.url`` starts with an explicit URL scheme.

		    URLs that already begin with ``http://`` or ``https://`` (any case)
		    are left untouched; anything else gets ``https://`` prepended.
		    """
		    # Test the prefix, not mere containment: a URL may legitimately embed
		    # "https://" in its query string, and an "http://" URL must not be
		    # turned into "https://http://...".
		    if not self.url.lower().startswith(('http://', 'https://')):
		        self.url = 'https://' + self.url

		def CheckFiles(self, path):
		    """Tell whether the images in ``self.path`` match the recorded page count.

		    Args:
		        path: text file whose first line holds the expected page count.

		    Returns:
		        True when the number of ``.jpg`` files in ``self.path`` equals the
		        recorded count; False when the file is missing, empty or does not
		        start with an integer, or when the counts differ.

		    On a successful read the count is stored in ``self.last_page_num`` as
		    an ``int``.
		    """
		    # read known page number
		    try:
		        with open(path, 'r') as txt:
		            first_line = txt.readline()
		    except FileNotFoundError:
		        return False

		    try:
		        # int() tolerates surrounding whitespace/newline; an empty or
		        # malformed line means the record cannot be trusted.
		        recorded_pages = int(first_line)
		    except ValueError:
		        return False
		    self.last_page_num = recorded_pages

		    # count images in directory; suffix match mirrors ClearDirectory
		    number_of_images = sum(
		        1 for name in os.listdir(self.path) if name.endswith('.jpg')
		    )
		    return number_of_images == recorded_pages
		@staticmethod
		def ClearDirectory(path):
		    """Delete every entry in ``path`` whose name ends with ``.jpg``."""
		    for entry in os.listdir(path):
		        if entry.endswith(".jpg"):
		            os.remove(os.path.join(path, entry))

		def FetchImageUrls(self, force_redownload: bool = False):
		# Scrape self.url in a Chrome session (undetected_chromedriver), collect the
		# image URLs found on the page into self.image_urls, then call
		# WebsiteExceptions() and DownloadImages().
		# Raises ImagesNotFoundInRequest when no image URL passes the filter.
		# NOTE(review): indentation is not visible in this view, so the exact
		# nesting of the statements below cannot be confirmed from here.
		LOGGER.info('Scraping website for images')

		# The target directory is named after the title when one is set,
		# otherwise after the URL with the listed characters removed.
		# NOTE(review): paths are joined with a literal backslash — looks
		# Windows-only; confirm.
		if not self.title:
		_url = self.url.translate({ord(c): None for c in '\./:*?"<>\|'})
		self.path = rf'{SOURCE_DOWNLOAD_PATH}\{_url}'

		else:
		self.path = rf'{SOURCE_DOWNLOAD_PATH}\{self.title}'

		path_to_page_num = rf'{self.path}\pages.txt'

		# Create the directory if needed; for an existing directory (and no forced
		# redownload) ask CheckFiles whether the images are already complete.
		are_downloaded = False
		if not os.path.exists(self.path):
		os.makedirs(self.path)
		elif os.path.exists(self.path) and force_redownload is False:
		are_downloaded = self.CheckFiles(path_to_page_num)

		# presumably everything from here on runs only when a download is
		# needed — TODO confirm against the indented original
		if are_downloaded is False:
		self.ClearDirectory(self.path)

		options = Options()
		# options.add_argument("--headless")
		driver = uc.Chrome(options=options)

		# wait for cloudflare to pass
		driver.get(self.url)
		# fixed 10 second pause before the page source is read
		time.sleep(10)
		soup = BeautifulSoup(driver.page_source, 'html.parser')
		driver.close()

		# Take the src of every <img> element and keep those containing 'https'.
		_elements = soup.find_all('img')
		urls = [img['src'] for img in _elements]
		images = [k for k in urls if 'https' in k]
		temp_list = []

		# filter images for only those with numbers at the end and makes sure there are no duplicates
		for i in images:

		try:
		# re.search returns None when the pattern does not match; calling
		# .group on None raises AttributeError, which skips this URL below.
		# NOTE(review): as written the pattern only allows a one-character
		# host and at most one character before 'jpg' — quantifiers may have
		# been lost in this view; confirm.
		temp = (re.search('https://(.)/(.?)jpg', i)).group(2)

		if any(k.isdigit() for k in temp) and temp not in temp_list and len(temp) < 10 and temp not in self.template:
		# spaces are percent-encoded before the URL is stored
		i = i.replace(' ', '%20')
		self.image_urls.append(i)

		temp_list.append(temp)

		except AttributeError:
		pass

		if not self.image_urls:
		raise ImagesNotFoundInRequest(self.image_urls)

		self.WebsiteExceptions()
		self.DownloadImages()

		def WebsiteExceptions(self):
		    """Apply site-specific rewrites to ``self.image_urls`` in place.

		    When any collected URL contains ``nhentai``, every URL has its
		    ``https://t`` prefix replaced by ``https://i`` and ``t.jpg`` replaced
		    by ``.jpg``. Otherwise the list is left as it is.
		    """
		    links = self.image_urls
		    needs_rewrite = any('nhentai' in link for link in links)
		    if needs_rewrite:
		        # Slice assignment keeps the same list object, as the index-wise
		        # update did.
		        links[:] = [
		            link.replace('https://t', 'https://i').replace('t.jpg', '.jpg')
		            for link in links
		        ]
		    self.image_urls = links

		def DownloadImages(self):
		n = 1
		LOGGER.info('Downloading images')

		for i in self.image_urls:
		img_data = requests.get(i, stream=True)
		def FetchImageUrls(self):
		    """Download ``self.url`` with gallery-dl and record the output directory.

		    Loads the gallery-dl configuration, runs a ``DownloadJob`` for
		    ``self.url`` and stores the job's target directory in ``self.path``.
		    When the job exposes no path formatter, ``self.path`` is left
		    unchanged and the situation is logged instead of raising.
		    """
		    config.load()
		    job = DownloadJob(self.url)
		    status = job.run()

		    # NOTE(review): gallery-dl appears to leave ``pathfmt`` as None until
		    # the first file/directory is handled (e.g. unsupported URL or failed
		    # extraction) — confirm against the installed gallery-dl version.
		    path_format = getattr(job, 'pathfmt', None)
		    if path_format is None:
		        LOGGER.info(f'gallery-dl produced no download directory for {self.url} (status {status})')
		        return
		    self.path = path_format.directory

		with open(rf'{self.path}\{n:03}.jpg', 'wb') as image:
		shutil.copyfileobj(img_data.raw, image)
		n += 1
		# Avoid IP ban
		time.sleep(1)
		def download_source(self):
		self.url = self.config_pnl.src_link_flag
		if self.url:
		LOGGER.info(f'Url set to {self.url}')

		self.last_page_num = len(self.image_urls)
		self.ValidateUrl()
		self.FetchImageUrls()

		self.SaveNumberOfPages(rf'{self.path}\pages.txt')
		proj_path = self.ReturnFullPathToProject()
		LOGGER.info(f'Project path set to {proj_path}')

		def run(self, url: str, force_redownload: bool, title: str = ''):
		self.SetUrl(url)
		if title:
		self.SetTitle(title)
		self.FetchImageUrls(force_redownload)
		if proj_path:
		self.menu.openDir(proj_path)
		LOGGER.info('Download complete')

ballontranslator/dl/pagesources/constants.py

deleted100644 → 0

+0 −5

Original line number	Diff line number	Diff line
		import os.path as osp

		# Directory that contains this module.
		UI_PATH = osp.dirname(osp.abspath(__file__))
		# Parent directory of UI_PATH.
		PROGRAM_PATH = osp.dirname(UI_PATH)
		# Join the components separately so the platform's own separator is used;
		# a literal backslash would become part of a single directory name on
		# non-Windows systems.
		SOURCE_DOWNLOAD_PATH = osp.join(PROGRAM_PATH, "pagesources", "projects")
		No newline at end of file

ballontranslator/dl/pagesources/exceptions.py

deleted100644 → 0

+0 −40

Original line number	Diff line number	Diff line
		# BaseError structure stolen from dmMaze
		class BaseError(Exception):
		    """
		    base error structure class

		    Carries the offending value together with a user-facing message;
		    ``str()`` renders them as ``"<val> --> <message>"``.
		    """

		    def __init__(self, val, message):
		        """
		        @param val: actual value
		        @param message: message shown to the user
		        """
		        self.val = val
		        self.message = message
		        # Forward both values so ``args`` is populated: the standard
		        # exception copy/pickle protocol rebuilds the instance from
		        # ``args`` and would otherwise call the constructor without
		        # its required arguments.
		        super().__init__(val, message)

		    def __str__(self):
		        return "{} --> {}".format(self.val, self.message)


		class NotValidUrl(BaseError):
		    """Raised when the URL entered by the user is not valid."""

		    def __init__(self, val, message='text must be a valid url, it must also include https://'):
		        # Hand the value and the (default) message on to BaseError.
		        super().__init__(val, message)


		class ImagesNotFoundInRequest(BaseError):
		    """Raised when no images could be located on the requested website."""

		    def __init__(
		        self,
		        val,
		        message='the specified website is not currently supported, '
		                'you can always suggest implementation of any website on github',
		    ):
		        # Hand the value and the (default) message on to BaseError.
		        super().__init__(val, message)