First upload of automatic source download integration (19a0f667) · Commits · git-mirror / BallonsTranslator

.gitignore

+2 −0

Original line number	Diff line number	Diff line
		@@ -5,6 +5,8 @@ result
		ballontranslator/data/models
		ballontranslator/data/testpacks/eng_dontupload
		ballontranslator/data/testpacks/testpacks
		ballontranslator/data/*.png
		ballontranslator/ui/pagesources/projects
		release

		tmp.py

ballontranslator/dl/pagesources/IMPLEMENTED_SOURCES.md

0 → 100644

+5 −0

Original line number	Diff line number	Diff line
		# List of known working sources
		+ mangairo
		+ asura scans
		+ reaper scans
		+ nhentai
		No newline at end of file

ballontranslator/dl/pagesources/__init__.py

0 → 100644

+144 −0

Original line number	Diff line number	Diff line
		import requests
		import undetected_chromedriver as uc
		from undetected_chromedriver import ChromeOptions
		from bs4 import BeautifulSoup
		import re
		import time
		from constants import SOURCE_DOWNLOAD_PATH
		from exceptions import ImagesNotFoundInRequest, NotValidUrl
		import os


		class SourceBase:
		    """
		    Downloads the page images found at a url into a local project folder.

		    The page is rendered with a headless undetected-chromedriver browser,
		    every ``<img>`` source is collected, the ones that look like numbered
		    ``.jpg`` pages are kept and saved as ``001.jpg``, ``002.jpg``, ...
		    The number of pages is stored in ``pages.txt`` next to the images so a
		    later run can tell whether the download is already complete.
		    """

		    # Characters removed from a url before it is used as a folder name.
		    _ILLEGAL_PATH_CHARS = r'\/:*?"<>|'

		    # Group 2 is the file stem of an https ``.jpg`` url (text between the
		    # last ``/`` and ``.jpg``).
		    _PAGE_IMAGE_RE = re.compile(r'https://(.*)/(.*?)\.jpg')

		    # Seconds to wait for a single image request before giving up.
		    _REQUEST_TIMEOUT = 30

		    def __init__(self):
		        self.url: str = ''
		        self.path: str = ''
		        self.title: str = ''
		        # file stems that are never treated as pages
		        self.template: list[str] = ['cover']
		        self.image_urls: list[str] = []
		        self.last_page_num: int = 0

		    def SetUrl(self, url):
		        """Store the url of the page to download from."""
		        self.url = url

		    def SetTitle(self, title):
		        """Store the title used as the download folder name."""
		        self.title = title

		    def SaveNumberOfPages(self, path):
		        """
		        Write ``self.last_page_num`` to the file at ``path``.

		        @param path: file that receives the page count (overwritten)
		        """
		        # mode 'w' already truncates, no separate clearing step is needed
		        with open(path, 'w', encoding='utf-8') as txt:
		            txt.write(str(self.last_page_num))

		    def ReturnNumberOfPages(self) -> int:
		        """Return the last known number of pages."""
		        return self.last_page_num

		    def CheckFiles(self, path):
		        """
		        Check whether the download in ``self.path`` is complete.

		        @param path: file holding the expected page count
		        @return: True if the number of ``.jpg`` files in ``self.path``
		                 equals the stored page count; False if they differ or
		                 the count file is missing, empty or not a number
		        """
		        # read known page number
		        try:
		            with open(path, 'r', encoding='utf-8') as txt:
		                first_line = txt.readline()
		        except FileNotFoundError:
		            return False

		        # an empty or corrupted file means the count is unknown
		        try:
		            expected = int(first_line)
		        except ValueError:
		            return False

		        # keep the attribute an int, as ReturnNumberOfPages promises
		        self.last_page_num = expected

		        # count images in directory
		        number_of_images = sum(
		            1 for name in os.listdir(self.path) if name.endswith('.jpg')
		        )
		        return number_of_images == expected

		    def _ResolveDownloadDir(self):
		        """Return the folder for this download: the title, else the cleaned url."""
		        if self.title:
		            folder = self.title
		        else:
		            # filter url for illegal characters
		            folder = self.url.translate(
		                {ord(c): None for c in self._ILLEGAL_PATH_CHARS}
		            )
		        return os.path.join(SOURCE_DOWNLOAD_PATH, folder)

		    def _FilterPageUrls(self, urls):
		        """
		        Return the urls that look like numbered page images, in order.

		        A url is kept when it contains ``https``, matches the ``.jpg``
		        pattern, and its file stem contains a digit, is shorter than 10
		        characters, is not listed in ``self.template`` and was not seen
		        before. Spaces in kept urls are percent-encoded.
		        """
		        seen_stems = set()
		        page_urls = []

		        for url in urls:
		            if 'https' not in url:
		                continue

		            match = self._PAGE_IMAGE_RE.search(url)
		            if match is None:
		                continue

		            stem = match.group(2)
		            if stem in seen_stems:
		                continue
		            seen_stems.add(stem)

		            if (any(ch.isdigit() for ch in stem)
		                    and len(stem) < 10
		                    and stem not in self.template):
		                page_urls.append(url.replace(' ', '%20'))

		        return page_urls

		    def FetchImageUrls(self, force_redownload: bool = False):
		        """
		        Collect the page image urls of ``self.url`` and download them.

		        Nothing is fetched when the download folder already exists, its
		        stored page count matches the images on disk and
		        ``force_redownload`` is false.

		        @param force_redownload: download again even if the files exist
		        """
		        # set download path
		        self.path = self._ResolveDownloadDir()
		        path_to_page_num = os.path.join(self.path, 'pages.txt')

		        # check if the files are already downloaded
		        already_present = os.path.isdir(self.path)
		        os.makedirs(self.path, exist_ok=True)
		        if (already_present
		                and not force_redownload
		                and self.CheckFiles(path_to_page_num)):
		            return

		        # initialize webdriver
		        options = ChromeOptions()
		        options.add_argument('headless')
		        driver = uc.Chrome(options=options)

		        try:
		            # load page and wait for cloudflare to pass
		            driver.get(self.url)
		            time.sleep(10)
		            page_html = driver.page_source
		        finally:
		            # always release the browser process, even if loading failed
		            driver.quit()

		        soup = BeautifulSoup(page_html, 'html.parser')

		        # find all images; <img> tags without a src attribute are skipped
		        sources = [img.get('src') for img in soup.find_all('img')]

		        # start from a clean list so repeated runs do not pile up urls
		        self.image_urls = self._FilterPageUrls([s for s in sources if s])

		        # download images
		        self.DownloadImages()

		    def DownloadImages(self):
		        """
		        Download every url in ``self.image_urls`` into ``self.path``.

		        Files are named by position (``001.jpg``, ``002.jpg``, ...). The
		        page count is written to ``pages.txt`` afterwards.
		        """
		        for page_no, url in enumerate(self.image_urls, start=1):
		            response = requests.get(url, timeout=self._REQUEST_TIMEOUT)

		            # Do not store an error page as an image. The missing file makes
		            # CheckFiles report the download as incomplete on the next run.
		            if response.ok:
		                target = os.path.join(self.path, f'{page_no:03}.jpg')
		                with open(target, 'wb') as image:
		                    image.write(response.content)

		            # Avoid IP ban
		            time.sleep(1)

		        self.last_page_num = len(self.image_urls)

		        self.SaveNumberOfPages(os.path.join(self.path, 'pages.txt'))

		    def run(self, url: str, force_redownload: bool, title: str = ''):
		        """
		        Set the url (and optional title) and start the download.

		        @param url: page to download images from
		        @param force_redownload: download again even if the files exist
		        @param title: folder name for the download; when empty the folder
		                      name is derived from the url
		        """
		        self.SetUrl(url)
		        if title:
		            self.SetTitle(title)
		        self.FetchImageUrls(force_redownload)


		if __name__ == '__main__':
		    # Manual run: download one hard-coded gallery into the folder 'Pog',
		    # reusing the files on disk if they are already complete.
		    SourceBase().run(
		        url='https://nhentai.net/g/444882/',
		        force_redownload=False,
		        title='Pog',
		    )

ballontranslator/dl/pagesources/constants.py

0 → 100644

+5 −0

Original line number	Diff line number	Diff line
		import os.path as osp

		# Directory that contains this constants module.
		UI_PATH = osp.dirname(osp.abspath(__file__))
		# Parent directory of UI_PATH.
		PROGRAM_PATH = osp.dirname(UI_PATH)
		# Root folder for downloaded page sources. The components are joined
		# separately so the path is valid on every platform, not only on Windows
		# where a backslash is a separator.
		SOURCE_DOWNLOAD_PATH = osp.join(PROGRAM_PATH, "pagesources", "projects")
		No newline at end of file

ballontranslator/dl/pagesources/exceptions.py

0 → 100644

+40 −0

Original line number	Diff line number	Diff line
		# BaseError structure stolen from dmMaze
		class BaseError(Exception):
		    """
		    base error structure class

		    Carries the offending value and a user-facing message; ``str()`` of the
		    exception renders them as ``"<val> --> <message>"``.
		    """

		    def __init__(self, val, message):
		        """
		        @param val: actual value
		        @param message: message shown to the user
		        """
		        self.val = val
		        self.message = message
		        # Forward both values so ``args`` is populated; with empty args the
		        # exception cannot be rebuilt by copy/pickle and its repr is blank.
		        super().__init__(val, message)

		    def __str__(self):
		        return "{} --> {}".format(self.val, self.message)


		class NotValidUrl(BaseError):
		    """
		    raised when the url entered by the user is not valid
		    """

		    # message used when the caller does not supply one
		    DEFAULT_MESSAGE = 'text must be a valid url, it must also include https://'

		    def __init__(self, val, message=DEFAULT_MESSAGE):
		        """
		        @param val: the rejected url text
		        @param message: message shown to the user
		        """
		        super().__init__(val, message)


		class ImagesNotFoundInRequest(BaseError):
		    """
		    raised when no images could be located on a website
		    """

		    # message used when the caller does not supply one
		    DEFAULT_MESSAGE = ('the specified website is not currently supported, '
		                       'you can always suggest implementation of any website on github')

		    def __init__(self, val, message=DEFAULT_MESSAGE):
		        """
		        @param val: the value the failure relates to
		        @param message: message shown to the user
		        """
		        super().__init__(val, message)