Updated almost all functions to make everything work (835ad000) · Commits · git-mirror / BallonsTranslator

modules/ocr/ocr_paddle.py

+45 −40

Original line number	Diff line number	Diff line
		@@ -218,7 +218,7 @@ class PaddleOCRModule(OCRBase):
		try:
		result = self.model.ocr(cropped_img, det=True, rec=True, cls=self.use_angle_cls)

		# Извлечение сырого текста из результата OCR
		# Extract raw text from OCR result
		raw_texts = []
		if isinstance(result, list) and len(result) > 0 and isinstance(result[0], list):
		for line in result[0]:
		@@ -229,7 +229,7 @@ class PaddleOCRModule(OCRBase):
		if self.debug_mode:
		self.logger.debug(f"Raw OCR text from the block ({x1}, {y1}, {x2}, {y2}): {raw_text}")

		# Обработка результата OCR
		# Process the OCR result
		text = self._process_result(result)

		if self.debug_mode:
		@@ -246,29 +246,6 @@ class PaddleOCRModule(OCRBase):
		self.logger.warning('Invalid text block coordinates for target image')
		blk.text = ''

		def _apply_text_case(self, text: str) -> str:
		if self.text_case == 'Uppercase':
		return text.upper()
		elif self.text_case == 'Capitalize Sentences':
		return self._capitalize_sentences(text)
		elif self.text_case == 'Lowercase':
		return text.lower()
		else:
		return text # Без изменений, если режим не распознан

		def _capitalize_sentences(self, text: str) -> str:
		def process_sentence(sentence):
		words = sentence.split()
		if not words:
		return ''
		if len(words) == 1:
		return words[0].capitalize()
		else:
		return ' '.join([words[0].capitalize()] + [word.lower() for word in words[1:]])

		sentences = re.split(r'(?<=[.!?…])\s+', text)
		return ' '.join(process_sentence(sentence) for sentence in sentences)

		def _process_result(self, result):
		try:
		if not result or result[0] is None:
		@@ -277,36 +254,64 @@ class PaddleOCRModule(OCRBase):
		if isinstance(result, list) and len(result) > 0 and isinstance(result[0], list):
		result = result[0]

		texts = []
		raw_texts = []
		for line in result:
		if isinstance(line, list) and len(line) > 1 and isinstance(line[1], (list, tuple)) and len(line[1]) > 0:
		text = line[1][0]
		text = re.sub(r'-(?!\w)', '', text)
		text = re.sub(r'\s+', ' ', text)
		text = self._apply_text_case(text) # Применяем выбранный регистр
		text = self._apply_punctuation_and_spacing(text)
		texts.append(text.strip())

		if not texts:
		return ''
		raw_texts.append(text)

		# Обработка формата вывода
		# Depending on the output_format, we concatenate the lines
		if self.output_format == 'Single Line':
		text = ' '.join(texts)
		joined_text = ' '.join(raw_texts)
		# Text cleaning
		joined_text = re.sub(r'-(?!\w)', '', joined_text)
		joined_text = re.sub(r'\s+', ' ', joined_text)
		elif self.output_format == 'As Recognized':
		text = '\n'.join(texts)
		joined_text = ' '.join(raw_texts) # Combine with spaces to create a single text
		# Clean up text, preserve line breaks
		joined_text = re.sub(r'-(?!\w)', '', joined_text)
		joined_text = re.sub(r'\s+', ' ', joined_text)
		else:
		text = ' '.join(texts) # По умолчанию
		joined_text = ' '.join(raw_texts)
		joined_text = re.sub(r'-(?!\w)', '', joined_text)
		joined_text = re.sub(r'\s+', ' ', joined_text)

		# Apply case conversion to all text
		processed_text = self._apply_text_case(joined_text)
		processed_text = self._apply_punctuation_and_spacing(processed_text)

		if self.debug_mode:
		self.logger.debug(f"Final processed text: {text}")
		self.logger.debug(f"Final processed text: {processed_text}")

		return text
		return processed_text
		except Exception as e:
		if self.debug_mode:
		self.logger.error(f"Error processing OCR result: {str(e)}")
		return ''

		def _apply_text_case(self, text: str) -> str:
		if self.text_case == 'Uppercase':
		return text.upper()
		elif self.text_case == 'Capitalize Sentences':
		return self._capitalize_sentences(text)
		elif self.text_case == 'Lowercase':
		return text.lower()
		else:
		return text # No change if the mode is not recognized

		def _capitalize_sentences(self, text: str) -> str:
		def process_sentence(sentence):
		words = sentence.split()
		if not words:
		return ''
		if len(words) == 1:
		return words[0].capitalize()
		else:
		return ' '.join([words[0].capitalize()] + [word.lower() for word in words[1:]])

		# We divide into sentences only by punctuation marks
		sentences = re.split(r'(?<=[.!?…])\s+', text)
		return ' '.join(process_sentence(sentence) for sentence in sentences)

		def _apply_punctuation_and_spacing(self, text: str) -> str:
		text = re.sub(r'\s+([,.!?…])', r'\1', text)