diff --git a/json-md-yaml-translator.py b/json-md-yaml-translator.py new file mode 100644 index 0000000..931d621 --- /dev/null +++ b/json-md-yaml-translator.py @@ -0,0 +1,818 @@ +import re +import os +import json +import logging +import argparse +import yaml +from typing import List, Dict, Optional, Union, Any, Tuple +from pathlib import Path +from openai import OpenAI +import langdetect + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("ContentTranslator") + +# Dictionary of supported languages and their codes +SUPPORTED_LANGUAGES = { + 'arabic': 'ar', # granite-3.1-8b-instruct + 'chinese': 'zh', # granite-3.1-8b-instruct + 'czech': 'cs', # granite-3.1-8b-instruct + 'dutch': 'nl', # granite-3.1-8b-instruct + 'english': 'en', # granite-3.1-8b-instruct + 'french': 'fr', # granite-3.1-8b-instruct + 'german': 'de', # granite-3.1-8b-instruct + 'hindi': 'hi', + 'italian': 'it', # granite-3.1-8b-instruct + 'japanese': 'ja', # granite-3.1-8b-instruct + 'korean': 'ko', # granite-3.1-8b-instruct + 'portuguese': 'pt', # granite-3.1-8b-instruct + 'russian': 'ru', + 'spanish': 'es', # granite-3.1-8b-instruct + 'swedish': 'sv', + 'turkish': 'tr', +} + + +class ContentTranslator: + def __init__( + self, + api_key: str = "no-key", + base_url: str = "http://localhost:11434/v1/", + model: str = "granite3.1-dense:latest", + source_lang: Optional[str] = None, + target_lang: Optional[str] = 'en' + ): + """ + Initialize the translator with API credentials and endpoint configuration. 
+ + Args: + api_key: API key for the translation service + base_url: Base URL for the API endpoint + model: Model name to use for translation + source_lang: Optional source language code or name (e.g., 'es' or 'spanish') + target_lang: Target language code or name (e.g., 'fr' or 'french'), defaults to English + """ + self.client = OpenAI( + api_key=api_key, + base_url=base_url + ) + self.model = model + self.source_lang = self._normalize_language_code( + source_lang) if source_lang else None + self.target_lang = self._normalize_language_code( + target_lang) if target_lang else 'en' + logger.info(f"Initialized translator with model: {model}") + logger.info(f"Using API endpoint: {base_url}") + if self.source_lang: + logger.info( + f"Source language explicitly set to: {self.source_lang}") + logger.info(f"Target language set to: {self.target_lang}") + + def _normalize_language_code(self, lang: str) -> str: + """ + Normalize language input to standard language code. + + Args: + lang: Language name or code (e.g., 'japanese' or 'ja') + + Returns: + Normalized language code + """ + if not lang: + return None + + lang = lang.lower() + + # If it's already a valid language code, return it + if lang in SUPPORTED_LANGUAGES.values(): + return lang + + # If it's a language name, convert to code + if lang in SUPPORTED_LANGUAGES: + return SUPPORTED_LANGUAGES[lang] + + # If we can't normalize it, log a warning and return as-is + logger.warning(f"Unrecognized language specification: {lang}") + return lang + + def _get_language_name(self, lang_code: str) -> str: + """ + Get the language name from a language code. 
+ + Args: + lang_code: Language code (e.g., 'ja') + + Returns: + Language name (e.g., 'Japanese') + """ + for name, code in SUPPORTED_LANGUAGES.items(): + if code == lang_code: + return name.capitalize() + return lang_code.upper() + + def read_file(self, file_path: str) -> str: + """Read the content of a file.""" + with open(file_path, 'r', encoding='utf-8') as file: + return file.read() + + def _is_yaml_file(self, file_path: str) -> bool: + """Check if a file is a YAML file based on extension.""" + return file_path.lower().endswith(('.yaml', '.yml')) + + def _is_jsonl_file(self, file_path: str) -> bool: + """Check if a file is a JSONL file based on extension.""" + return file_path.lower().endswith(('.jsonl', '.ndjson')) + + def split_into_blocks(self, content: str) -> List[Dict[str, str]]: + """ + Split markdown content into individual blocks (paragraphs, headers, tables, or lists). + + Args: + content: The markdown content as a string + + Returns: + List of dictionaries containing block type and content + """ + blocks = [] + current_lines = [] + current_type = "paragraph" + + def is_table_line(line: str) -> bool: + """Check if a line is part of a markdown table.""" + stripped = line.strip() + return stripped.startswith('|') or stripped.startswith('+-') or stripped.startswith('|-') + + def is_list_line(line: str) -> bool: + """Check if a line is part of a markdown list.""" + stripped = line.strip() + # Match numbered lists (1., 2., etc.) 
or bullet lists (-, *, +) + return bool(re.match(r'^\d+\.|\s*[-*+]', stripped)) + + def is_indented(line: str) -> bool: + """Check if a line is indented (part of a list or code block).""" + return line.startswith(' ') or line.startswith('\t') + + def add_block(lines, type_="paragraph"): + if lines: + content = '\n'.join(lines).strip() + if content: # Only add non-empty blocks + blocks.append({ + "type": type_, + "content": content + }) + + lines = content.split('\n') + i = 0 + while i < len(lines): + line = lines[i] + stripped = line.strip() + + # Handle headers + if stripped.startswith('#'): + add_block(current_lines, current_type) + current_lines = [line] + add_block(current_lines, "header") + current_lines = [] + current_type = "paragraph" + + # Handle tables + elif is_table_line(line): + if current_type != "table": + add_block(current_lines, current_type) + current_lines = [] + current_type = "table" + current_lines.append(line) + + # Handle lists + elif is_list_line(line) or (current_type == "list" and (stripped or is_indented(line))): + if current_type != "list": + add_block(current_lines, current_type) + current_lines = [] + current_type = "list" + current_lines.append(line) + + # Handle code blocks + elif stripped.startswith('```'): + if current_type != "code": + add_block(current_lines, current_type) + current_lines = [] + current_type = "code" + current_lines.append(line) + # Skip until we find the closing code block + if len(current_lines) == 1: # Opening fence + i += 1 + while i < len(lines) and not lines[i].strip().startswith('```'): + current_lines.append(lines[i]) + i += 1 + if i < len(lines): + current_lines.append(lines[i]) # Add closing fence + + # Handle empty lines + elif not stripped: + if current_type in ["table", "list", "code"]: + # For tables, lists and code blocks, preserve empty lines + current_lines.append(line) + else: + # For paragraphs, use empty lines as separators + add_block(current_lines, current_type) + current_lines = [] + 
current_type = "paragraph" + + # Handle regular paragraph lines + else: + if current_type in ["table", "list", "code"]: + # Check if we're actually continuing the special block + if (current_type == "table" and not is_table_line(line)) or \ + (current_type == "list" and not (is_list_line(line) or is_indented(line))): + add_block(current_lines, current_type) + current_lines = [] + current_type = "paragraph" + current_lines.append(line) + + i += 1 + + # Add the final block + add_block(current_lines, current_type) + + return blocks + + def split_yaml_into_blocks(self, content: str) -> List[Dict[str, Any]]: + """ + Split YAML content into blocks, preserving structure. + + Args: + content: YAML content as a string + + Returns: + List of dictionaries with translatable parts and their structure + """ + blocks = [] + + try: + # Parse the YAML content + yaml_data = yaml.safe_load(content) + + # Process the YAML data recursively + self._process_yaml_node(yaml_data, blocks) + + return blocks + except yaml.YAMLError as e: + logger.error(f"Error parsing YAML content: {str(e)}") + # If parsing fails, treat as a single block + blocks.append({ + "type": "yaml_content", + "content": content, + "structure": None + }) + return blocks + + def split_jsonl_into_blocks(self, content: str) -> List[Dict[str, Any]]: + """ + Split JSONL content into blocks for translation + + Args: + content: JSONL content as a string + + Returns: + List of dictionaries with translatable parts and their structure + """ + blocks = [] + + try: + # Process each line as a separate JSON object + for i, line in enumerate(content.strip().split('\n')): + if not line.strip(): + continue + + try: + # Parse the JSON object + json_obj = json.loads(line) + + # Process each JSON object as a separate block with its line number + line_blocks = [] + self._process_json_node(json_obj, line_blocks, []) + + # Add line number information to each block + for block in line_blocks: + block["line_number"] = i + blocks.append(block) + 
+ except json.JSONDecodeError as e: + logger.error(f"Error parsing JSON at line {i+1}: {str(e)}") + # Add the whole line as a single non-translatable block + blocks.append({ + "type": "json_error", + "content": line, + "line_number": i + }) + + return blocks + + except Exception as e: + logger.error(f"Error processing JSONL file: {str(e)}") + # If parsing fails completely, return the whole content as a block + blocks.append({ + "type": "jsonl_content", + "content": content, + "structure": None + }) + return blocks + + def _process_json_node(self, node: Any, blocks: List[Dict[str, Any]], path: List[str] = None) -> None: + """ + Process a JSON node recursively, extracting translatable strings. + + Args: + node: The current JSON node + blocks: List to store translatable blocks + path: Current path in the JSON structure + """ + if path is None: + path = [] + + if isinstance(node, dict): + # Process dictionary + for key, value in node.items(): + new_path = path + [key] + if isinstance(value, str) and value.strip(): + # Translatable leaf node (only non-empty strings) + blocks.append({ + "type": "json_value", + "content": value, + "path": new_path, + "key": key + }) + elif isinstance(value, (dict, list)): + # Recursively process nested structures + self._process_json_node(value, blocks, new_path) + + elif isinstance(node, list): + # Process list + for i, item in enumerate(node): + new_path = path + [str(i)] + if isinstance(item, str) and item.strip(): + # Translatable leaf node (only non-empty strings) + blocks.append({ + "type": "json_value", + "content": item, + "path": new_path, + "index": i + }) + elif isinstance(item, (dict, list)): + # Recursively process nested structures + self._process_json_node(item, blocks, new_path) + + def _process_yaml_node(self, node: Any, blocks: List[Dict[str, Any]], path: List[str] = None) -> None: + """ + Process a YAML node recursively, extracting translatable strings. 
+ + Args: + node: The current YAML node + blocks: List to store translatable blocks + path: Current path in the YAML structure + """ + if path is None: + path = [] + + if isinstance(node, dict): + # Process dictionary + for key, value in node.items(): + new_path = path + [key] + if isinstance(value, (str, int, float, bool)) and isinstance(value, str): + # Translatable leaf node + blocks.append({ + "type": "yaml_value", + "content": value, + "path": new_path, + "key": key + }) + elif isinstance(value, (dict, list)): + # Recursively process nested structures + self._process_yaml_node(value, blocks, new_path) + + elif isinstance(node, list): + # Process list + for i, item in enumerate(node): + new_path = path + [str(i)] + if isinstance(item, (str, int, float, bool)) and isinstance(item, str): + # Translatable leaf node + blocks.append({ + "type": "yaml_value", + "content": item, + "path": new_path, + "index": i + }) + elif isinstance(item, (dict, list)): + # Recursively process nested structures + self._process_yaml_node(item, blocks, new_path) + + def detect_language(self, text: str) -> str: + """ + Detect the language of a text while ignoring markdown formatting. + Returns explicitly set language if specified, otherwise attempts detection. + + Args: + text: Text to detect language from + + Returns: + Language code (e.g., 'en' for English) + """ + # If source language is explicitly set, use it + if self.source_lang: + return self.source_lang + + try: + # Remove markdown formatting for better language detection + cleaned_text = re.sub(r'[#*_`|[-\]()]+', ' ', text) + cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip() + + if not cleaned_text: # If text is empty or only contains markdown + return 'unknown' + + return langdetect.detect(cleaned_text) + except langdetect.LangDetectException: + return 'unknown' + + def _extract_code_comments(self, code: str) -> List[Dict[str, str]]: + """ + Extract comments from code while preserving their position and format. 
+ + Args: + code: String containing code with comments + + Returns: + List of dictionaries containing comment info (text, start_pos, end_pos, type) + """ + comments = [] + lines = code.split('\n') + current_pos = 0 + + def is_in_string(line: str, pos: int) -> bool: + """Check if position is inside a string literal.""" + in_single = False + in_double = False + escaped = False + + for i in range(pos): + if line[i] == '\\': + escaped = not escaped + continue + + if not escaped: + if line[i] == "'": + if not in_double: + in_single = not in_single + elif line[i] == '"': + if not in_single: + in_double = not in_double + + escaped = False + + return in_single or in_double + + i = 0 + while i < len(lines): + line = lines[i] + line_start = current_pos + + # Handle single-line comments + if '//' in line: # JavaScript/C-style + pos = line.index('//') + if not is_in_string(line, pos): + comments.append({ + 'text': line[pos:], + 'start_pos': line_start + pos, + 'end_pos': line_start + len(line), + 'type': 'single' + }) + + elif '#' in line: # Python-style + pos = line.index('#') + if not is_in_string(line, pos): + comments.append({ + 'text': line[pos:], + 'start_pos': line_start + pos, + 'end_pos': line_start + len(line), + 'type': 'single' + }) + + # Handle multi-line comments + elif '/*' in line: # JavaScript/C-style + pos = line.index('/*') + if not is_in_string(line, pos): + comment_text = [line[pos:]] + comment_start = line_start + pos + i += 1 + + # Collect comment lines until we find the closing */ + while i < len(lines): + if '*/' in lines[i]: + end_pos = lines[i].index('*/') + 2 + comment_text.append(lines[i][:end_pos]) + comments.append({ + 'text': '\n'.join(comment_text), + 'start_pos': comment_start, + 'end_pos': current_pos + len(lines[i]), + 'type': 'multi' + }) + break + else: + comment_text.append(lines[i]) + i += 1 + current_pos += len(lines[i]) + 1 + + elif '"""' in line or "'''" in line: # Python docstring + if line.count('"""') == 1 or line.count("'''") == 
1: # Opening quote + marker = '"""' if '"""' in line else "'''" + pos = line.index(marker) + if not is_in_string(line[:pos], pos): + comment_text = [line[pos:]] + comment_start = line_start + pos + i += 1 + + # Collect docstring lines until we find the closing quotes + while i < len(lines): + if marker in lines[i]: + end_pos = lines[i].index(marker) + 3 + comment_text.append(lines[i][:end_pos]) + comments.append({ + 'text': '\n'.join(comment_text), + 'start_pos': comment_start, + 'end_pos': current_pos + end_pos, + 'type': 'docstring' + }) + break + else: + comment_text.append(lines[i]) + i += 1 + current_pos += len(lines[i]) + 1 + + current_pos += len(line) + 1 + i += 1 + + return comments + + def translate_jsonl_block(self, block: Dict[str, Any]) -> Dict[str, Any]: + """ + Translate a JSONL value block. + + Args: + block: Dictionary with JSON value and structure info + + Returns: + Dictionary with translated content and original structure info + """ + # Skip non-translatable blocks + if block["type"] not in ["json_value"]: + return block + + content = block["content"] + + # Don't translate if it's not a string or empty + if not isinstance(content, str) or not content.strip(): + return block + + # Get the language (either specified or detected) + detected_lang = self.detect_language(content) + + # If it's already in the target language or can't be detected, return original + if detected_lang == self.target_lang or (detected_lang == 'unknown' and not self.source_lang): + return block + + # Get language names for clearer prompts + source_lang_name = self._get_language_name( + detected_lang) if detected_lang != 'unknown' else "the source language" + target_lang_name = self._get_language_name(self.target_lang) + + # Prepare translation prompt + system_prompt = f"""You are a JSON content translator. + + Rules: + 1. Translate the text from {source_lang_name} to {target_lang_name} + 2. 
Preserve any special formatting, variables, or placeholders exactly as they appear + 3. Return ONLY the translated text without any additional text + 4. Keep line breaks exactly as they appear in the original + 5. DO NOT include tags like "" or "" in your response + 6. Respond ONLY with the translated text and nothing else + + For this {block["type"]}: + {special_instructions.get(block["type"], "")} in the original + 5. Maintain the exact same meaning in the translation + 6. Do not add any quotes, brackets, braces, or other JSON syntax - just translate the text content + """ + + user_prompt = f"""Translate this JSON value from {source_lang_name} to {target_lang_name}, preserving all formatting and variables: + + + {content} + + + IMPORTANT: Return only the translated text. Do not include any tags, explanations, or other markers in your response. + Preserve all variable placeholders, brackets, braces like {{variable}}, and special characters exactly as they appear. + """ + + try: + # Make the API call + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + temperature=0.3 + ) + + translated_text = response.choices[0].message.content.strip() + + # Clean up any unwanted elements from the response + translated_text = self._clean_response(translated_text) + + # Return updated block + translated_block = block.copy() + translated_block["content"] = translated_text + + return translated_block + + except Exception as e: + logger.error(f"Error translating JSON block: {str(e)}") + return block # Return original block if translation fails + + def translate_yaml_block(self, block: Dict[str, Any]) -> Dict[str, Any]: + """ + Translate a YAML value block. 
+ + Args: + block: Dictionary with YAML value and structure info + + Returns: + Dictionary with translated content and original structure info + """ + content = block["content"] + + # Don't translate if it's not a string + if not isinstance(content, str) or not content.strip(): + return block + + # Get the language (either specified or detected) + detected_lang = self.detect_language(content) + + # If it's already in the target language or can't be detected, return original + if detected_lang == self.target_lang or (detected_lang == 'unknown' and not self.source_lang): + return block + + # Get language names for clearer prompts + source_lang_name = self._get_language_name( + detected_lang) if detected_lang != 'unknown' else "the source language" + target_lang_name = self._get_language_name(self.target_lang) + + # Prepare translation prompt + system_prompt = f"""You are a YAML content translator. + + Rules: + 1. Translate the text from {source_lang_name} to {target_lang_name} + 2. Preserve any special formatting, variables, or placeholders exactly as they appear + 3. Return ONLY the translated text without any additional text + 4. Keep line breaks exactly as they appear in the original + 5. Maintain the exact same meaning in the translation + """ + + user_prompt = f"""Translate this YAML value from {source_lang_name} to {target_lang_name}, preserving all formatting and variables: + + + {content} + + + IMPORTANT: Return only the translated text. Do not include any tags, explanations, or other markers in your response. + Preserve all variable placeholders, brackets, braces like {{variable}}, and special characters exactly as they appear. 
+ """ + + try: + # Make the API call + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + temperature=0.3 + ) + + translated_text = response.choices[0].message.content.strip() + + # Clean up any unwanted elements from the response + translated_text = self._clean_response(translated_text) + + # Return updated block + translated_block = block.copy() + translated_block["content"] = translated_text + + return translated_block + + except Exception as e: + logger.error(f"Error translating YAML block: {str(e)}") + return block # Return original block if translation fails + + def translate_block(self, block: Dict[str, str]) -> Dict[str, str]: + """ + Translate a single block using the configured API. + + Args: + block: Dictionary containing block type and content + + Returns: + Dictionary with translated content + """ + # For code blocks, only translate comments + if block["type"] == "code": + # Extract the actual code content (remove the ```language and ``` markers) + code_lines = block["content"].split('\n') + if len(code_lines) >= 2: # Valid code block with markers + code_content = '\n'.join(code_lines[1:-1]) # Remove markers + code_language = code_lines[0].replace('```', '').strip() + + # Extract comments + comments = self._extract_code_comments(code_content) + if not comments: # No comments to translate + return block + + # Translate each comment + translated_code = code_content + # Process from end to avoid position shifts + for comment in reversed(comments): + comment_text = comment['text'] + detected_lang = self.detect_language(comment_text) + + # Only translate if source and target languages are different + if detected_lang != self.target_lang and detected_lang != 'unknown': + # Prepare translation prompt for comment + target_lang_name = self._get_language_name( + self.target_lang) + system_prompt = f"""You are a code comment translator. 
+ Translate only the comment text from {self._get_language_name(detected_lang)} to {target_lang_name} while preserving any comment markers and formatting. + If the comment contains code examples, variable names, or function names, keep those unchanged. + Preserve any special comment markers (e.g., @param, @return, TODO:, FIXME:, etc.).""" + + user_prompt = f"Translate this code comment to {target_lang_name}:\n\n{comment_text}" + + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + temperature=0.3 + ) + + translated_comment = response.choices[0].message.content.strip( + ) + # Replace the comment in the code while preserving indentation + translated_code = ( + translated_code[:comment['start_pos']] + + translated_comment + + translated_code[comment['end_pos']:] + ) + except Exception as e: + logger.error( + f"Error translating comment: {str(e)}") + + # Reconstruct the code block with markers + return { + "type": "code", + "content": f"```{code_language}\n{translated_code}\n```" + } + + # For non-code blocks, handle regular markdown translation + special_instructions = { + "table": "- Output the table with exact same structure, only translated content in cells\n- Keep all | and - characters in their exact positions", + "list": "- Output the list with exact same markers and indentation\n- Keep all list markers (-, *, +, numbers) exactly as they appear", + "code": "- Output the code block with exact same structure\n- Only translate comments, keep code unchanged", + "header": "- Output the header with exact same # symbols\n- Keep same number of # characters at the start", + "paragraph": "- Output the paragraph with exact same inline formatting\n- Keep all *bold*, _italic_, [links](urls) exactly as they appear" + } + + # Get the language (either specified or detected) + detected_lang = self.detect_language(block["content"]) + logger.debug( 
+ f"Language: {detected_lang} ({'specified' if self.source_lang else 'detected'})") + + # If it's already in the target language or can't be detected, return original + if detected_lang == self.target_lang or (detected_lang == 'unknown' and not self.source_lang): + logger.debug( + f"Text is already in {self._get_language_name(self.target_lang)} or language cannot be detected. Skipping translation.") + return block + + # Get language names for clearer prompts + source_lang_name = self._get_language_name( + detected_lang) if detected_lang != 'unknown' else "the source language" + target_lang_name = self._get_language_name(self.target_lang) + + # Updated system prompt to be more explicit about not including the markers + system_prompt = f"""You are a markdown translator that only outputs the translated content. + + Rules: + 1. Translate the content between the tags "" and "" from {source_lang_name} to {target_lang_name} + 2. Keep all markdown formatting exactly as is + 3. Return ONLY the translated content without ANY additional text + 4. 
Keep line breaks exactly as they appear diff --git a/markdown-yaml-translator.py b/markdown-yaml-translator.py new file mode 100644 index 0000000..939ef3d --- /dev/null +++ b/markdown-yaml-translator.py @@ -0,0 +1,3190 @@ +import re +import os +import json +import logging +import argparse +import yaml +from typing import List, Dict, Optional, Union, Any, Tuple +from pathlib import Path +from openai import OpenAI +import langdetect + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("ContentTranslator") + +# Dictionary of supported languages and their codes +SUPPORTED_LANGUAGES = { + 'arabic': 'ar', # granite-3.1-8b-instruct + 'chinese': 'zh', # granite-3.1-8b-instruct + 'czech': 'cs', # granite-3.1-8b-instruct + 'dutch': 'nl', # granite-3.1-8b-instruct + 'english': 'en', # granite-3.1-8b-instruct + 'french': 'fr', # granite-3.1-8b-instruct + 'german': 'de', # granite-3.1-8b-instruct + 'hindi': 'hi', + 'italian': 'it', # granite-3.1-8b-instruct + 'japanese': 'ja', # granite-3.1-8b-instruct + 'korean': 'ko', # granite-3.1-8b-instruct + 'portuguese': 'pt', # granite-3.1-8b-instruct + 'russian': 'ru', + 'spanish': 'es', # granite-3.1-8b-instruct + 'swedish': 'sv', + 'turkish': 'tr', +} + + +class ContentTranslator: + def __init__( + self, + api_key: str = "no-key", + base_url: str = "http://localhost:11434/v1/", + model: str = "granite3.1-dense:latest", + source_lang: Optional[str] = None, + target_lang: Optional[str] = 'en' + ): + """ + Initialize the translator with API credentials and endpoint configuration. 
+ + Args: + api_key: API key for the translation service + base_url: Base URL for the API endpoint + model: Model name to use for translation + source_lang: Optional source language code or name (e.g., 'es' or 'spanish') + target_lang: Target language code or name (e.g., 'fr' or 'french'), defaults to English + """ + self.client = OpenAI( + api_key=api_key, + base_url=base_url + ) + self.model = model + self.source_lang = self._normalize_language_code( + source_lang) if source_lang else None + self.target_lang = self._normalize_language_code( + target_lang) if target_lang else 'en' + logger.info(f"Initialized translator with model: {model}") + logger.info(f"Using API endpoint: {base_url}") + if self.source_lang: + logger.info( + f"Source language explicitly set to: {self.source_lang}") + logger.info(f"Target language set to: {self.target_lang}") + + def _normalize_language_code(self, lang: str) -> str: + """ + Normalize language input to standard language code. + + Args: + lang: Language name or code (e.g., 'japanese' or 'ja') + + Returns: + Normalized language code + """ + if not lang: + return None + + lang = lang.lower() + + # If it's already a valid language code, return it + if lang in SUPPORTED_LANGUAGES.values(): + return lang + + # If it's a language name, convert to code + if lang in SUPPORTED_LANGUAGES: + return SUPPORTED_LANGUAGES[lang] + + # If we can't normalize it, log a warning and return as-is + logger.warning(f"Unrecognized language specification: {lang}") + return lang + + def _get_language_name(self, lang_code: str) -> str: + """ + Get the language name from a language code. 
+ + Args: + lang_code: Language code (e.g., 'ja') + + Returns: + Language name (e.g., 'Japanese') + """ + for name, code in SUPPORTED_LANGUAGES.items(): + if code == lang_code: + return name.capitalize() + return lang_code.upper() + + def read_file(self, file_path: str) -> str: + """Read the content of a file.""" + with open(file_path, 'r', encoding='utf-8') as file: + return file.read() + + def _is_yaml_file(self, file_path: str) -> bool: + """Check if a file is a YAML file based on extension.""" + return file_path.lower().endswith(('.yaml', '.yml')) + + def split_into_blocks(self, content: str) -> List[Dict[str, str]]: + """ + Split markdown content into individual blocks (paragraphs, headers, tables, or lists). + + Args: + content: The markdown content as a string + + Returns: + List of dictionaries containing block type and content + """ + blocks = [] + current_lines = [] + current_type = "paragraph" + + def is_table_line(line: str) -> bool: + """Check if a line is part of a markdown table.""" + stripped = line.strip() + return stripped.startswith('|') or stripped.startswith('+-') or stripped.startswith('|-') + + def is_list_line(line: str) -> bool: + """Check if a line is part of a markdown list.""" + stripped = line.strip() + # Match numbered lists (1., 2., etc.) 
or bullet lists (-, *, +) + return bool(re.match(r'^\d+\.|\s*[-*+]', stripped)) + + def is_indented(line: str) -> bool: + """Check if a line is indented (part of a list or code block).""" + return line.startswith(' ') or line.startswith('\t') + + def add_block(lines, type_="paragraph"): + if lines: + content = '\n'.join(lines).strip() + if content: # Only add non-empty blocks + blocks.append({ + "type": type_, + "content": content + }) + + lines = content.split('\n') + i = 0 + while i < len(lines): + line = lines[i] + stripped = line.strip() + + # Handle headers + if stripped.startswith('#'): + add_block(current_lines, current_type) + current_lines = [line] + add_block(current_lines, "header") + current_lines = [] + current_type = "paragraph" + + # Handle tables + elif is_table_line(line): + if current_type != "table": + add_block(current_lines, current_type) + current_lines = [] + current_type = "table" + current_lines.append(line) + + # Handle lists + elif is_list_line(line) or (current_type == "list" and (stripped or is_indented(line))): + if current_type != "list": + add_block(current_lines, current_type) + current_lines = [] + current_type = "list" + current_lines.append(line) + + # Handle code blocks + elif stripped.startswith('```'): + if current_type != "code": + add_block(current_lines, current_type) + current_lines = [] + current_type = "code" + current_lines.append(line) + # Skip until we find the closing code block + if len(current_lines) == 1: # Opening fence + i += 1 + while i < len(lines) and not lines[i].strip().startswith('```'): + current_lines.append(lines[i]) + i += 1 + if i < len(lines): + current_lines.append(lines[i]) # Add closing fence + + # Handle empty lines + elif not stripped: + if current_type in ["table", "list", "code"]: + # For tables, lists and code blocks, preserve empty lines + current_lines.append(line) + else: + # For paragraphs, use empty lines as separators + add_block(current_lines, current_type) + current_lines = [] + 
current_type = "paragraph" + + # Handle regular paragraph lines + else: + if current_type in ["table", "list", "code"]: + # Check if we're actually continuing the special block + if (current_type == "table" and not is_table_line(line)) or \ + (current_type == "list" and not (is_list_line(line) or is_indented(line))): + add_block(current_lines, current_type) + current_lines = [] + current_type = "paragraph" + current_lines.append(line) + + i += 1 + + # Add the final block + add_block(current_lines, current_type) + + return blocks + + def split_yaml_into_blocks(self, content: str) -> List[Dict[str, Any]]: + """ + Split YAML content into blocks, preserving structure. + + Args: + content: YAML content as a string + + Returns: + List of dictionaries with translatable parts and their structure + """ + blocks = [] + + try: + # Parse the YAML content + yaml_data = yaml.safe_load(content) + + # Process the YAML data recursively + self._process_yaml_node(yaml_data, blocks) + + return blocks + except yaml.YAMLError as e: + logger.error(f"Error parsing YAML content: {str(e)}") + # If parsing fails, treat as a single block + blocks.append({ + "type": "yaml_content", + "content": content, + "structure": None + }) + return blocks + + def _process_yaml_node(self, node: Any, blocks: List[Dict[str, Any]], path: List[str] = None) -> None: + """ + Process a YAML node recursively, extracting translatable strings. 
def _extract_code_comments(self, code: str) -> List[Dict[str, Any]]:
    """
    Extract comments from code together with their exact character offsets.

    The scan is line based and language agnostic. On each line the first
    occurrence of ``//``, ``#`` and ``/*`` is considered; the left-most one
    that is not inside a quoted string starts a comment. If none applies,
    a line holding exactly one triple-quote marker opens a docstring.
    Multi-line comments and docstrings run up to and including their
    closing marker; an opener without a closing marker is ignored rather
    than swallowing the rest of the code.

    Known limitations of the heuristic: only the first occurrence of each
    marker on a line is examined, one-line triple-quoted strings are not
    reported, and ``//`` is always treated as a comment marker.

    Args:
        code: String containing code with comments

    Returns:
        List of dictionaries in source order, each with:
            text: the comment exactly as it appears in ``code``
            start_pos: offset of the first comment character in ``code``
            end_pos: offset just past the last comment character, so that
                ``code[start_pos:end_pos] == text``
            type: 'single', 'multi' or 'docstring'
    """
    lines = code.split('\n')

    # Absolute offset of the first character of every line in `code`.
    line_offsets = []
    running = 0
    for text in lines:
        line_offsets.append(running)
        running += len(text) + 1  # +1 for the '\n' consumed by split()

    def is_in_string(line: str, pos: int) -> bool:
        """Check if position is inside a string literal."""
        in_single = False
        in_double = False
        escaped = False

        for ch in line[:pos]:
            if ch == '\\':
                escaped = not escaped
                continue

            if not escaped:
                if ch == "'" and not in_double:
                    in_single = not in_single
                elif ch == '"' and not in_single:
                    in_double = not in_double

            escaped = False

        return in_single or in_double

    def find_close(start_line: int, start_col: int, closer: str):
        """Locate `closer` at or after (start_line, start_col).

        Returns (line index, column just past the closer), or None when
        the closer never appears.
        """
        col = start_col
        for idx in range(start_line, len(lines)):
            found = lines[idx].find(closer, col)
            if found != -1:
                return idx, found + len(closer)
            col = 0  # following lines are searched from their start
        return None

    # (opening marker, comment type, closing marker or None for line comments)
    line_markers = (
        ('//', 'single', None),   # JavaScript/C-style
        ('#', 'single', None),    # Python-style
        ('/*', 'multi', '*/'),    # JavaScript/C-style block comment
    )

    comments = []
    i = 0
    while i < len(lines):
        line = lines[i]
        line_start = line_offsets[i]

        # Collect every marker whose first occurrence is outside a string,
        # then let the left-most one decide what kind of comment starts here.
        candidates = []
        for marker, kind, closer in line_markers:
            col = line.find(marker)
            if col != -1 and not is_in_string(line, col):
                candidates.append((col, marker, kind, closer))

        opener = min(candidates, key=lambda c: c[0]) if candidates else None

        if opener is None:
            # Python docstring: a line with exactly one triple-quote marker.
            for marker in ('"""', "'''"):
                if line.count(marker) == 1:
                    col = line.index(marker)
                    if not is_in_string(line, col):
                        opener = (col, marker, 'docstring', marker)
                    break

        if opener is not None:
            col, marker, kind, closer = opener
            if closer is None:
                # Single-line comment: runs to the end of the line.
                comments.append({
                    'text': line[col:],
                    'start_pos': line_start + col,
                    'end_pos': line_start + len(line),
                    'type': kind
                })
            else:
                closing = find_close(i, col + len(marker), closer)
                if closing is not None:
                    end_line, end_col = closing
                    start_pos = line_start + col
                    end_pos = line_offsets[end_line] + end_col
                    comments.append({
                        'text': code[start_pos:end_pos],
                        'start_pos': start_pos,
                        'end_pos': end_pos,
                        'type': kind
                    })
                    # Resume scanning after the line holding the closer.
                    i = end_line

        i += 1

    return comments
Keep line breaks exactly as they appear in the original + 5. Maintain the exact same meaning in the translation + """ + + user_prompt = f"""Translate this YAML value from {source_lang_name} to {target_lang_name}, preserving all formatting and variables: + + + {content} + + + IMPORTANT: Return only the translated text. Do not include any tags, explanations, or other markers in your response. + Preserve all variable placeholders, brackets, braces like {{variable}}, and special characters exactly as they appear. + """ + + try: + # Make the API call + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + temperature=0.3 + ) + + translated_text = response.choices[0].message.content.strip() + + # Clean up any unwanted elements from the response + translated_text = self._clean_response(translated_text) + + # Return updated block + translated_block = block.copy() + translated_block["content"] = translated_text + + return translated_block + + except Exception as e: + logger.error(f"Error translating YAML block: {str(e)}") + return block # Return original block if translation fails + + def translate_block(self, block: Dict[str, str]) -> Dict[str, str]: + """ + Translate a single block using the configured API. 
+ + Args: + block: Dictionary containing block type and content + + Returns: + Dictionary with translated content + """ + # For code blocks, only translate comments + if block["type"] == "code": + # Extract the actual code content (remove the ```language and ``` markers) + code_lines = block["content"].split('\n') + if len(code_lines) >= 2: # Valid code block with markers + code_content = '\n'.join(code_lines[1:-1]) # Remove markers + code_language = code_lines[0].replace('```', '').strip() + + # Extract comments + comments = self._extract_code_comments(code_content) + if not comments: # No comments to translate + return block + + # Translate each comment + translated_code = code_content + # Process from end to avoid position shifts + for comment in reversed(comments): + comment_text = comment['text'] + detected_lang = self.detect_language(comment_text) + + # Only translate if source and target languages are different + if detected_lang != self.target_lang and detected_lang != 'unknown': + # Prepare translation prompt for comment + target_lang_name = self._get_language_name( + self.target_lang) + system_prompt = f"""You are a code comment translator. + Translate only the comment text from {self._get_language_name(detected_lang)} to {target_lang_name} while preserving any comment markers and formatting. + If the comment contains code examples, variable names, or function names, keep those unchanged. 
+ Preserve any special comment markers (e.g., @param, @return, TODO:, FIXME:, etc.).""" + + user_prompt = f"Translate this code comment to {target_lang_name}:\n\n{comment_text}" + + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + temperature=0.3 + ) + + translated_comment = response.choices[0].message.content.strip( + ) + # Replace the comment in the code while preserving indentation + translated_code = ( + translated_code[:comment['start_pos']] + + translated_comment + + translated_code[comment['end_pos']:] + ) + except Exception as e: + logger.error( + f"Error translating comment: {str(e)}") + + # Reconstruct the code block with markers + return { + "type": "code", + "content": f"```{code_language}\n{translated_code}\n```" + } + + # For non-code blocks, handle regular markdown translation + special_instructions = { + "table": "- Output the table with exact same structure, only translated content in cells\n- Keep all | and - characters in their exact positions", + "list": "- Output the list with exact same markers and indentation\n- Keep all list markers (-, *, +, numbers) exactly as they appear", + "code": "- Output the code block with exact same structure\n- Only translate comments, keep code unchanged", + "header": "- Output the header with exact same # symbols\n- Keep same number of # characters at the start", + "paragraph": "- Output the paragraph with exact same inline formatting\n- Keep all *bold*, _italic_, [links](urls) exactly as they appear" + } + + # Get the language (either specified or detected) + detected_lang = self.detect_language(block["content"]) + logger.debug( + f"Language: {detected_lang} ({'specified' if self.source_lang else 'detected'})") + + # If it's already in the target language or can't be detected, return original + if detected_lang == self.target_lang or (detected_lang == 'unknown' and not 
self.source_lang): + logger.debug( + f"Text is already in {self._get_language_name(self.target_lang)} or language cannot be detected. Skipping translation.") + return block + + # Get language names for clearer prompts + source_lang_name = self._get_language_name( + detected_lang) if detected_lang != 'unknown' else "the source language" + target_lang_name = self._get_language_name(self.target_lang) + + # Updated system prompt to be more explicit about not including the markers + system_prompt = f"""You are a markdown translator that only outputs the translated content. + + Rules: + 1. Translate the content between the tags "" and "" from {source_lang_name} to {target_lang_name} + 2. Keep all markdown formatting exactly as is + 3. Return ONLY the translated content without ANY additional text + 4. Keep line breaks exactly as they appear in the original + 5. DO NOT include tags like "" or "" in your response + 6. Respond ONLY with the translated text and nothing else + + For this {block["type"]}: + {special_instructions.get(block["type"], "")}""" + + user_prompt = f"""Translate this {block["type"]} from {source_lang_name} to {target_lang_name}, preserving all formatting. + + + {block['content']} + + + IMPORTANT: Output only the translated text. Do not include any tags, or other markers in your response. 
Preserve the exact same markdown formatting of the original content.""" + + try: + # Log the request + logger.debug("\n=== REQUEST ===") + logger.debug(f"Model: {self.model}") + logger.debug("Messages:") + logger.debug(json.dumps([ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], ensure_ascii=False, indent=2)) + + # Make the API call + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + temperature=0.3 + ) + + translated_text = response.choices[0].message.content.strip() + + # Clean up any unwanted elements from the response + translated_text = self._clean_response(translated_text) + + # Log the response + logger.debug("\n=== RESPONSE ===") + logger.debug(f"Response content:") + logger.debug(json.dumps(translated_text, + ensure_ascii=False, indent=2)) + + # Verify the translation preserves markdown structure + if block["type"] == "header" and not translated_text.startswith('#'): + logger.warning( + "Translation lost header formatting, attempting to restore") + original_hashes = re.match(r'^#+', block["content"]).group(0) + translated_text = f"{original_hashes} {translated_text.lstrip('#').lstrip()}" + + return { + "type": block["type"], + "content": translated_text + } + + except Exception as e: + logger.error(f"Error translating block: {str(e)}") + return block # Return original block if translation fails + + + def _clean_response(self, text: str) -> str: + """ + Clean up any unwanted tags or markers from the translated text. 
def _reconstruct_yaml(self, yaml_data: Dict[str, Any], translated_blocks: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Reconstruct YAML with translated values.

    Works on a deep copy, so ``yaml_data`` (including its nested
    containers) is never modified. Each block of type "yaml_value" is
    written to the location described by its "path"; blocks of any other
    type, blocks with an empty path, and blocks whose path no longer
    resolves in the data are skipped.

    Args:
        yaml_data: Original YAML data
        translated_blocks: List of translated blocks with path information.
            Path elements are dictionary keys, or list indices given as
            integers or numeric strings.

    Returns:
        Updated copy of the YAML data with translations applied
    """
    import copy  # local import: only this method needs it

    missing = object()  # sentinel: "this path element does not resolve"

    def resolve(container: Any, key: Any) -> Any:
        """Return the usable key/index for `key` in `container`, or `missing`."""
        if isinstance(container, dict):
            return key if key in container else missing
        if isinstance(container, list):
            try:
                idx = int(key)
            except (TypeError, ValueError):
                return missing
            return idx if 0 <= idx < len(container) else missing
        return missing

    result = copy.deepcopy(yaml_data)

    for block in translated_blocks:
        if block["type"] != "yaml_value":
            continue

        path = block["path"]
        if not path:
            continue

        # Navigate to the parent container of the value to replace.
        parent = result
        for key in path[:-1]:
            step = resolve(parent, key)
            if step is missing:
                parent = missing
                break
            parent = parent[step]

        # A broken path must not fall through to an update at the wrong level.
        if parent is missing:
            continue

        last = resolve(parent, path[-1])
        if last is not missing:
            parent[last] = block["content"]

    return result
+ + Args: + input_file: Path to input file + output_file: Optional path to save translated file + source_lang: Optional source language override + target_lang: Optional target language override + verbose: Whether to print progress information + log_level: Logging level to use + + Returns: + Translated content as string + """ + # Set logging level for this translation + logger.setLevel(getattr(logging, log_level.upper())) + + # Save original language settings + original_source_lang = self.source_lang + original_target_lang = self.target_lang + + # Allow overriding source and target languages for this specific translation + if source_lang: + self.source_lang = self._normalize_language_code(source_lang) + logger.info( + f"Source language temporarily set to: {self.source_lang}") + + if target_lang: + self.target_lang = self._normalize_language_code(target_lang) + logger.info( + f"Target language temporarily set to: {self.target_lang}") + + try: + # Read the content of the file + content = self.read_file(input_file) + + # Check if the file is YAML + is_yaml = self._is_yaml_file(input_file) + + if is_yaml: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + # Parse the YAML content + # Process the file based on its type + if is_yaml: + try: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + original_yaml = yaml.safe_load(content) + blocks = self.split_yaml_into_blocks(content) + + # Translate each block + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = str(block["content"])[:50] + "..." 
if len(str(block["content"])) > 50 else str(block["content"]) + logger.info(f"Processing YAML block {i}/{total_blocks}: {block_preview}") + + translated = self.translate_yaml_block(block) + translated_blocks.append(translated) + + # Reconstruct the YAML structure with translated values + translated_yaml = self._reconstruct_yaml(original_yaml, translated_blocks) + + # Convert back to YAML string with preserved formatting + translated_content = yaml.dump(translated_yaml, allow_unicode=True, sort_keys=False) + except Exception as e: + logger.error(f"YAML translation failed: {str(e)}") + raise + else: + if verbose: + logger.info(f"Processing {input_file} as Markdown") + + # Split the markdown content into blocks + blocks = self.split_into_blocks(content) + + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = block["content"][:50] + "..." if len(block["content"]) > 50 else block["content"] + logger.info(f"Processing block {i}/{total_blocks} ({block['type']}): {block_preview}") + + translated = self.translate_block(block) + translated_blocks.append(translated) + + # Combine translated blocks with appropriate spacing + translated_content = "" + for i, block in enumerate(translated_blocks): + if i > 0: # Add spacing between blocks + if block["type"] == "header" or translated_blocks[i-1]["type"] == "header": + translated_content += "\n\n" + elif block["type"] == "paragraph" and translated_blocks[i-1]["type"] == "paragraph": + translated_content += "\n\n" + translated_content += block["content"] + + # Save the output and return + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + f.write(translated_content) + if verbose: + logger.info(f"Translation saved to: {output_file}") + + return translated_content + + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + raise + + finally: + # Restore original language settings + self.source_lang = original_source_lang + 
def main():
    """
    Command-line entry point: parse arguments, translate one file, report status.

    Defaults for --api-key, --base-url, --model and --log-level are taken
    from the API_KEY, BASE_URL, MODEL_NAME and LOG_LEVEL environment
    variables when they are set.

    Returns:
        Exit code: 0 after a successful translation or after listing the
        supported languages, 1 when the input file does not exist or the
        translation fails. A missing input file argument ends the process
        through ``parser.error`` (exit status 2).
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(
        description='Translate markdown and YAML files between languages using AI.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Input file. Optional at parser level so --list-languages can be used
    # on its own; its presence is enforced below for every other invocation.
    parser.add_argument(
        'input_file',
        type=str,
        nargs='?',
        help='Path to the input file (markdown or YAML)'
    )

    # Optional arguments
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Path to save the translated file. If not provided, will use '
             '<input name>_<target language code><input extension> next to the input file'
    )
    parser.add_argument(
        '--source-lang',
        type=str,
        help='Source language code or name (e.g., "es" or "spanish")'
    )
    parser.add_argument(
        '--target-lang',
        type=str,
        default='en',
        help='Target language code or name (e.g., "fr" or "french"), defaults to English'
    )
    parser.add_argument(
        '--api-key',
        type=str,
        default=os.getenv("API_KEY", "no-key"),
        help='API key for the translation service'
    )
    parser.add_argument(
        '--base-url',
        type=str,
        default=os.getenv("BASE_URL", "http://localhost:11434/v1/"),
        help='Base URL for the API endpoint'
    )
    parser.add_argument(
        '--model',
        type=str,
        default=os.getenv("MODEL_NAME", "granite3.1-dense:latest"),
        help='Model name to use for translation'
    )

    # argparse does not validate defaults against `choices`, so the value
    # coming from the environment is normalised and checked here.
    valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
    env_level = os.getenv("LOG_LEVEL", "INFO").upper()
    parser.add_argument(
        '--log-level',
        type=str.upper,  # accept lower-case level names on the command line
        choices=valid_levels,
        default=env_level if env_level in valid_levels else "INFO",
        help='Set the logging level'
    )
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Suppress progress information'
    )
    parser.add_argument(
        '--list-languages',
        action='store_true',
        help='List all supported languages and exit'
    )

    args = parser.parse_args()

    # If the user just wants to list supported languages
    if args.list_languages:
        print("Supported Languages:")
        for name, code in sorted(SUPPORTED_LANGUAGES.items()):
            print(f" {name.capitalize()} ({code})")
        return 0

    if not args.input_file:
        parser.error('the following arguments are required: input_file')

    # Configure logging. basicConfig() does nothing once the root logger has
    # handlers (this module already configures logging at import time), so
    # the level and format are also applied to the existing handlers.
    log_level = getattr(logging, args.log_level)
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=log_level, format=log_format)
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)
    formatter = logging.Formatter(log_format)
    for handler in root_logger.handlers:
        handler.setFormatter(formatter)
    logger = logging.getLogger("ContentTranslator")

    # Log configuration
    logger.info(f"Using model: {args.model}")
    logger.info(f"Using base URL: {args.base_url}")

    # Verify input file exists
    if not os.path.exists(args.input_file):
        logger.error(f"Input file not found: {args.input_file}")
        return 1

    # Set default output file if not provided
    if not args.output:
        input_path = Path(args.input_file)
        requested = args.target_lang.lower()
        # Map a language name or code to its code; unknown values are used as given.
        target_lang_code = next(
            (code for name, code in SUPPORTED_LANGUAGES.items()
             if requested in (name.lower(), code.lower())),
            requested)
        args.output = str(
            input_path.parent / f"{input_path.stem}_{target_lang_code}{input_path.suffix}")

    # Create translator instance
    translator = ContentTranslator(
        api_key=args.api_key,
        base_url=args.base_url,
        model=args.model,
        source_lang=args.source_lang,
        target_lang=args.target_lang
    )

    try:
        translator.translate_content(
            args.input_file,
            args.output,
            source_lang=args.source_lang,
            target_lang=args.target_lang,
            verbose=not args.quiet,
            log_level=args.log_level
        )
        logger.info(
            f"Translation completed successfully! Output saved to: {args.output}")
        return 0
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        logger.error(f"Translation failed: {str(e)}")
        return 1


if __name__ == "__main__":
    # SystemExit works even where the site-provided exit() helper is absent.
    raise SystemExit(main())
+ + Args: + yaml_data: Original YAML data + translated_blocks: List of translated blocks with path information + + Returns: + Updated YAML data with translations + """ + # Create a deep copy of the original data + result = yaml_data.copy() + + # Apply translations + for block in translated_blocks: + if block["type"] != "yaml_value": + continue + + path = block["path"] + current = result + + # Navigate to the parent + for i, key in enumerate(path[:-1]): + if isinstance(current, dict): + if key not in current: + break + current = current[key] + elif isinstance(current, list): + idx = int(key) + if idx >= len(current): + break + current = current[idx] + else: + break + + # Update the value + last_key = path[-1] + if isinstance(current, dict) and last_key in current: + current[last_key] = block["content"] + elif isinstance(current, list): + idx = int(last_key) + if idx < len(current): + current[idx] = block["content"] + + return result + + def translate_content( + self, + input_file: str, + output_file: Optional[str] = None, + source_lang: Optional[str] = None, + target_lang: Optional[str] = None, + verbose: bool = True, + log_level: str = "INFO" + ) -> str: + """ + Translate a file (markdown or YAML) and save the result. 
+ + Args: + input_file: Path to input file + output_file: Optional path to save translated file + source_lang: Optional source language override + target_lang: Optional target language override + verbose: Whether to print progress information + log_level: Logging level to use + + Returns: + Translated content as string + """ + # Set logging level for this translation + logger.setLevel(getattr(logging, log_level.upper())) + + # Save original language settings + original_source_lang = self.source_lang + original_target_lang = self.target_lang + + # Allow overriding source and target languages for this specific translation + if source_lang: + self.source_lang = self._normalize_language_code(source_lang) + logger.info( + f"Source language temporarily set to: {self.source_lang}") + + if target_lang: + self.target_lang = self._normalize_language_code(target_lang) + logger.info( + f"Target language temporarily set to: {self.target_lang}") + + try: + # Read the content of the file + content = self.read_file(input_file) + + # Check if the file is YAML + is_yaml = self._is_yaml_file(input_file) + + if is_yaml: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + # Parse the YAML content + # Process the file based on its type + if is_yaml: + try: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + original_yaml = yaml.safe_load(content) + blocks = self.split_yaml_into_blocks(content) + + # Translate each block + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = str(block["content"])[:50] + "..." 
if len(str(block["content"])) > 50 else str(block["content"]) + logger.info(f"Processing YAML block {i}/{total_blocks}: {block_preview}") + + translated = self.translate_yaml_block(block) + translated_blocks.append(translated) + + # Reconstruct the YAML structure with translated values + translated_yaml = self._reconstruct_yaml(original_yaml, translated_blocks) + + # Convert back to YAML string with preserved formatting + translated_content = yaml.dump(translated_yaml, allow_unicode=True, sort_keys=False) + except Exception as e: + logger.error(f"YAML translation failed: {str(e)}") + raise + else: + if verbose: + logger.info(f"Processing {input_file} as Markdown") + + # Split the markdown content into blocks + blocks = self.split_into_blocks(content) + + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = block["content"][:50] + "..." if len(block["content"]) > 50 else block["content"] + logger.info(f"Processing block {i}/{total_blocks} ({block['type']}): {block_preview}") + + translated = self.translate_block(block) + translated_blocks.append(translated) + + # Combine translated blocks with appropriate spacing + translated_content = "" + for i, block in enumerate(translated_blocks): + if i > 0: # Add spacing between blocks + if block["type"] == "header" or translated_blocks[i-1]["type"] == "header": + translated_content += "\n\n" + elif block["type"] == "paragraph" and translated_blocks[i-1]["type"] == "paragraph": + translated_content += "\n\n" + translated_content += block["content"] + + # Save the output and return + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + f.write(translated_content) + if verbose: + logger.info(f"Translation saved to: {output_file}") + + return translated_content + + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + raise + + finally: + # Restore original language settings + self.source_lang = original_source_lang + 
self.target_lang = original_target_lang + + +def main(): + # Set up argument parser + parser = argparse.ArgumentParser( + description='Translate markdown and YAML files between languages using AI.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + # Required arguments + parser.add_argument( + 'input_file', + type=str, + help='Path to the input file (markdown or YAML)' + ) + + # Optional arguments + parser.add_argument( + '-o', '--output', + type=str, + help='Path to save the translated file. If not provided, will use input_file_translated.md/yml' + ) + parser.add_argument( + '--source-lang', + type=str, + help='Source language code or name (e.g., "es" or "spanish")' + ) + parser.add_argument( + '--target-lang', + type=str, + default='en', + help='Target language code or name (e.g., "fr" or "french"), defaults to English' + ) + parser.add_argument( + '--api-key', + type=str, + default=os.getenv("API_KEY", "no-key"), + help='API key for the translation service' + ) + parser.add_argument( + '--base-url', + type=str, + default=os.getenv("BASE_URL", "http://localhost:11434/v1/"), + help='Base URL for the API endpoint' + ) + parser.add_argument( + '--model', + type=str, + default=os.getenv("MODEL_NAME", "granite3.1-dense:latest"), + help='Model name to use for translation' + ) + parser.add_argument( + '--log-level', + type=str, + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + default=os.getenv("LOG_LEVEL", "INFO"), + help='Set the logging level' + ) + parser.add_argument( + '--quiet', + action='store_true', + help='Suppress progress information' + ) + parser.add_argument( + '--list-languages', + action='store_true', + help='List all supported languages and exit' + ) + + args = parser.parse_args() + + # If the user just wants to list supported languages + if args.list_languages: + print("Supported Languages:") + for name, code in sorted(SUPPORTED_LANGUAGES.items()): + print(f" {name.capitalize()} ({code})") + return 0 + + # Configure logging 
+ logging.basicConfig( + level=getattr(logging, args.log_level), + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + logger = logging.getLogger("ContentTranslator") + + # Log configuration + logger.info(f"Using model: {args.model}") + logger.info(f"Using base URL: {args.base_url}") + + # Verify input file exists + if not os.path.exists(args.input_file): + logger.error(f"Input file not found: {args.input_file}") + return 1 + + # Set default output file if not provided + if not args.output: + input_path = Path(args.input_file) + target_lang_code = args.target_lang.lower() + target_lang_code = next((code for name, code in SUPPORTED_LANGUAGES.items() + if name.lower() == target_lang_code.lower() + or code.lower() == target_lang_code.lower()), + target_lang_code) + args.output = str( + input_path.parent / f"{input_path.stem}_{target_lang_code}{input_path.suffix}") + + # Create translator instance + translator = ContentTranslator( + api_key=args.api_key, + base_url=args.base_url, + model=args.model, + source_lang=args.source_lang, + target_lang=args.target_lang + ) + + try: + translated_content = translator.translate_content( + args.input_file, + args.output, + source_lang=args.source_lang, + target_lang=args.target_lang, + verbose=not args.quiet, + log_level=args.log_level + ) + logger.info( + f"Translation completed successfully! Output saved to: {args.output}") + return 0 + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + return 1 + + +if __name__ == "__main__": + exit(main()), + r'IMPORTANT:.* + + def _reconstruct_yaml(self, yaml_data: Dict[str, Any], translated_blocks: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Reconstruct YAML with translated values. 
+ + Args: + yaml_data: Original YAML data + translated_blocks: List of translated blocks with path information + + Returns: + Updated YAML data with translations + """ + # Create a deep copy of the original data + result = yaml_data.copy() + + # Apply translations + for block in translated_blocks: + if block["type"] != "yaml_value": + continue + + path = block["path"] + current = result + + # Navigate to the parent + for i, key in enumerate(path[:-1]): + if isinstance(current, dict): + if key not in current: + break + current = current[key] + elif isinstance(current, list): + idx = int(key) + if idx >= len(current): + break + current = current[idx] + else: + break + + # Update the value + last_key = path[-1] + if isinstance(current, dict) and last_key in current: + current[last_key] = block["content"] + elif isinstance(current, list): + idx = int(last_key) + if idx < len(current): + current[idx] = block["content"] + + return result + + def translate_content( + self, + input_file: str, + output_file: Optional[str] = None, + source_lang: Optional[str] = None, + target_lang: Optional[str] = None, + verbose: bool = True, + log_level: str = "INFO" + ) -> str: + """ + Translate a file (markdown or YAML) and save the result. 
+ + Args: + input_file: Path to input file + output_file: Optional path to save translated file + source_lang: Optional source language override + target_lang: Optional target language override + verbose: Whether to print progress information + log_level: Logging level to use + + Returns: + Translated content as string + """ + # Set logging level for this translation + logger.setLevel(getattr(logging, log_level.upper())) + + # Save original language settings + original_source_lang = self.source_lang + original_target_lang = self.target_lang + + # Allow overriding source and target languages for this specific translation + if source_lang: + self.source_lang = self._normalize_language_code(source_lang) + logger.info( + f"Source language temporarily set to: {self.source_lang}") + + if target_lang: + self.target_lang = self._normalize_language_code(target_lang) + logger.info( + f"Target language temporarily set to: {self.target_lang}") + + try: + # Read the content of the file + content = self.read_file(input_file) + + # Check if the file is YAML + is_yaml = self._is_yaml_file(input_file) + + if is_yaml: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + # Parse the YAML content + # Process the file based on its type + if is_yaml: + try: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + original_yaml = yaml.safe_load(content) + blocks = self.split_yaml_into_blocks(content) + + # Translate each block + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = str(block["content"])[:50] + "..." 
if len(str(block["content"])) > 50 else str(block["content"]) + logger.info(f"Processing YAML block {i}/{total_blocks}: {block_preview}") + + translated = self.translate_yaml_block(block) + translated_blocks.append(translated) + + # Reconstruct the YAML structure with translated values + translated_yaml = self._reconstruct_yaml(original_yaml, translated_blocks) + + # Convert back to YAML string with preserved formatting + translated_content = yaml.dump(translated_yaml, allow_unicode=True, sort_keys=False) + except Exception as e: + logger.error(f"YAML translation failed: {str(e)}") + raise + else: + if verbose: + logger.info(f"Processing {input_file} as Markdown") + + # Split the markdown content into blocks + blocks = self.split_into_blocks(content) + + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = block["content"][:50] + "..." if len(block["content"]) > 50 else block["content"] + logger.info(f"Processing block {i}/{total_blocks} ({block['type']}): {block_preview}") + + translated = self.translate_block(block) + translated_blocks.append(translated) + + # Combine translated blocks with appropriate spacing + translated_content = "" + for i, block in enumerate(translated_blocks): + if i > 0: # Add spacing between blocks + if block["type"] == "header" or translated_blocks[i-1]["type"] == "header": + translated_content += "\n\n" + elif block["type"] == "paragraph" and translated_blocks[i-1]["type"] == "paragraph": + translated_content += "\n\n" + translated_content += block["content"] + + # Save the output and return + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + f.write(translated_content) + if verbose: + logger.info(f"Translation saved to: {output_file}") + + return translated_content + + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + raise + + finally: + # Restore original language settings + self.source_lang = original_source_lang + 
def main():
    """
    Command-line entry point: parse arguments, configure logging and
    translate a single markdown or YAML file.

    Returns:
        Process exit code: 0 on success (or after listing the supported
        languages), 1 when the input file is missing or translation fails
    """
    log_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']

    # argparse does not check defaults against `choices`, so an invalid or
    # lowercase LOG_LEVEL environment value must be normalized here.
    default_log_level = os.getenv("LOG_LEVEL", "INFO").upper()
    if default_log_level not in log_levels:
        default_log_level = "INFO"

    # Set up argument parser
    parser = argparse.ArgumentParser(
        description='Translate markdown and YAML files between languages using AI.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Positional argument; optional at parse time so that --list-languages
    # works on its own. Its presence is enforced below for translation runs.
    parser.add_argument(
        'input_file',
        type=str,
        nargs='?',
        help='Path to the input file (markdown or YAML)'
    )

    # Optional arguments
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Path to save the translated file. If not provided, the file is '
             'written next to the input as "STEM_LANGCODE.SUFFIX" (e.g. guide_fr.md)'
    )
    parser.add_argument(
        '--source-lang',
        type=str,
        help='Source language code or name (e.g., "es" or "spanish")'
    )
    parser.add_argument(
        '--target-lang',
        type=str,
        default='en',
        help='Target language code or name (e.g., "fr" or "french"), defaults to English'
    )
    parser.add_argument(
        '--api-key',
        type=str,
        default=os.getenv("API_KEY", "no-key"),
        help='API key for the translation service'
    )
    parser.add_argument(
        '--base-url',
        type=str,
        default=os.getenv("BASE_URL", "http://localhost:11434/v1/"),
        help='Base URL for the API endpoint'
    )
    parser.add_argument(
        '--model',
        type=str,
        default=os.getenv("MODEL_NAME", "granite3.1-dense:latest"),
        help='Model name to use for translation'
    )
    parser.add_argument(
        '--log-level',
        type=str.upper,  # accept "debug" as well as "DEBUG"
        choices=log_levels,
        default=default_log_level,
        help='Set the logging level'
    )
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Suppress progress information'
    )
    parser.add_argument(
        '--list-languages',
        action='store_true',
        help='List all supported languages and exit'
    )

    args = parser.parse_args()

    # If the user just wants to list supported languages
    if args.list_languages:
        print("Supported Languages:")
        for name, code in sorted(SUPPORTED_LANGUAGES.items()):
            print(f"  {name.capitalize()} ({code})")
        return 0

    if not args.input_file:
        # Same outcome as a missing required positional: usage + exit code 2
        parser.error("the following arguments are required: input_file")

    # Configure logging. The module already calls logging.basicConfig() at
    # import time, and basicConfig() does nothing once the root logger has
    # handlers, so the requested level/format are applied explicitly.
    level = getattr(logging, args.log_level)
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    root_logger = logging.getLogger()
    if root_logger.handlers:
        root_logger.setLevel(level)
        formatter = logging.Formatter(log_format)
        for handler in root_logger.handlers:
            handler.setFormatter(formatter)
    else:
        logging.basicConfig(level=level, format=log_format)
    logger = logging.getLogger("ContentTranslator")

    # Log configuration
    logger.info(f"Using model: {args.model}")
    logger.info(f"Using base URL: {args.base_url}")

    # Verify input file exists (a directory is not a usable input either)
    input_path = Path(args.input_file)
    if not input_path.is_file():
        logger.error(f"Input file not found: {args.input_file}")
        return 1

    # Set default output file if not provided: <stem>_<language code><suffix>
    if not args.output:
        target = args.target_lang.lower()
        # Language names map to their code; codes and unknown values pass through
        target_lang_code = SUPPORTED_LANGUAGES.get(target, target)
        args.output = str(
            input_path.parent / f"{input_path.stem}_{target_lang_code}{input_path.suffix}")

    # Create translator instance
    translator = ContentTranslator(
        api_key=args.api_key,
        base_url=args.base_url,
        model=args.model,
        source_lang=args.source_lang,
        target_lang=args.target_lang
    )

    try:
        translator.translate_content(
            args.input_file,
            args.output,
            source_lang=args.source_lang,
            target_lang=args.target_lang,
            verbose=not args.quiet,
            log_level=args.log_level
        )
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code
        logger.error(f"Translation failed: {str(e)}")
        return 1

    logger.info(
        f"Translation completed successfully! Output saved to: {args.output}")
    return 0


if __name__ == "__main__":
    # SystemExit works even when the site-provided exit() helper is unavailable
    raise SystemExit(main())
def translate_content(
    self,
    input_file: str,
    output_file: Optional[str] = None,
    source_lang: Optional[str] = None,
    target_lang: Optional[str] = None,
    verbose: bool = True,
    log_level: str = "INFO"
) -> str:
    """
    Translate a file (markdown or YAML) and save the result.

    Args:
        input_file: Path to input file
        output_file: Optional path to save translated file
        source_lang: Optional source language override (this call only)
        target_lang: Optional target language override (this call only)
        verbose: Whether to log progress information
        log_level: Logging level name to set on the module logger

    Returns:
        Translated content as string

    Raises:
        Exception: Any error raised while reading, translating or writing
            is logged and re-raised unchanged.
    """
    # Set logging level for this translation
    logger.setLevel(getattr(logging, log_level.upper()))

    # Save original language settings so the overrides below are temporary
    original_source_lang = self.source_lang
    original_target_lang = self.target_lang

    # Allow overriding source and target languages for this specific translation
    if source_lang:
        self.source_lang = self._normalize_language_code(source_lang)
        logger.info(
            f"Source language temporarily set to: {self.source_lang}")

    if target_lang:
        self.target_lang = self._normalize_language_code(target_lang)
        logger.info(
            f"Target language temporarily set to: {self.target_lang}")

    try:
        # Read the content of the file
        content = self.read_file(input_file)

        # Process the file based on its type
        if self._is_yaml_file(input_file):
            if verbose:
                logger.info(f"Processing {input_file} as YAML")

            try:
                original_yaml = yaml.safe_load(content)

                if original_yaml is None:
                    # Empty (or comment-only) document: nothing to translate,
                    # so the text is passed through unchanged.
                    translated_content = content
                else:
                    blocks = self.split_yaml_into_blocks(content)

                    # Translate each block
                    translated_blocks = []
                    total_blocks = len(blocks)

                    for i, block in enumerate(blocks, 1):
                        if verbose:
                            block_text = str(block["content"])
                            block_preview = block_text[:50] + "..." if len(block_text) > 50 else block_text
                            logger.info(f"Processing YAML block {i}/{total_blocks}: {block_preview}")

                        translated_blocks.append(self.translate_yaml_block(block))

                    # Reconstruct the YAML structure with translated values
                    translated_yaml = self._reconstruct_yaml(original_yaml, translated_blocks)

                    # Convert back to a YAML string, keeping key order and non-ASCII text
                    translated_content = yaml.dump(translated_yaml, allow_unicode=True, sort_keys=False)
            except Exception as e:
                logger.error(f"YAML translation failed: {str(e)}")
                raise
        else:
            if verbose:
                logger.info(f"Processing {input_file} as Markdown")

            # Split the markdown content into blocks
            blocks = self.split_into_blocks(content)

            translated_blocks = []
            total_blocks = len(blocks)

            for i, block in enumerate(blocks, 1):
                if verbose:
                    block_preview = block["content"][:50] + "..." if len(block["content"]) > 50 else block["content"]
                    logger.info(f"Processing block {i}/{total_blocks} ({block['type']}): {block_preview}")

                translated_blocks.append(self.translate_block(block))

            # Combine translated blocks: a blank line goes around headers and
            # between consecutive paragraphs; other neighbours are joined as-is.
            parts = []
            for i, block in enumerate(translated_blocks):
                if i > 0:
                    previous_type = translated_blocks[i - 1]["type"]
                    if block["type"] == "header" or previous_type == "header":
                        parts.append("\n\n")
                    elif block["type"] == "paragraph" and previous_type == "paragraph":
                        parts.append("\n\n")
                parts.append(block["content"])
            translated_content = "".join(parts)

        # Save the output and return
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(translated_content)
            if verbose:
                logger.info(f"Translation saved to: {output_file}")

        return translated_content

    except Exception as e:
        logger.error(f"Translation failed: {str(e)}")
        raise

    finally:
        # Restore original language settings
        self.source_lang = original_source_lang
        self.target_lang = original_target_lang
+ logging.basicConfig( + level=getattr(logging, args.log_level), + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + logger = logging.getLogger("ContentTranslator") + + # Log configuration + logger.info(f"Using model: {args.model}") + logger.info(f"Using base URL: {args.base_url}") + + # Verify input file exists + if not os.path.exists(args.input_file): + logger.error(f"Input file not found: {args.input_file}") + return 1 + + # Set default output file if not provided + if not args.output: + input_path = Path(args.input_file) + target_lang_code = args.target_lang.lower() + target_lang_code = next((code for name, code in SUPPORTED_LANGUAGES.items() + if name.lower() == target_lang_code.lower() + or code.lower() == target_lang_code.lower()), + target_lang_code) + args.output = str( + input_path.parent / f"{input_path.stem}_{target_lang_code}{input_path.suffix}") + + # Create translator instance + translator = ContentTranslator( + api_key=args.api_key, + base_url=args.base_url, + model=args.model, + source_lang=args.source_lang, + target_lang=args.target_lang + ) + + try: + translated_content = translator.translate_content( + args.input_file, + args.output, + source_lang=args.source_lang, + target_lang=args.target_lang, + verbose=not args.quiet, + log_level=args.log_level + ) + logger.info( + f"Translation completed successfully! Output saved to: {args.output}") + return 0 + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + return 1 + + +if __name__ == "__main__": + exit(main()), + r'Do not include.* + + def _reconstruct_yaml(self, yaml_data: Dict[str, Any], translated_blocks: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Reconstruct YAML with translated values. 
+ + Args: + yaml_data: Original YAML data + translated_blocks: List of translated blocks with path information + + Returns: + Updated YAML data with translations + """ + # Create a deep copy of the original data + result = yaml_data.copy() + + # Apply translations + for block in translated_blocks: + if block["type"] != "yaml_value": + continue + + path = block["path"] + current = result + + # Navigate to the parent + for i, key in enumerate(path[:-1]): + if isinstance(current, dict): + if key not in current: + break + current = current[key] + elif isinstance(current, list): + idx = int(key) + if idx >= len(current): + break + current = current[idx] + else: + break + + # Update the value + last_key = path[-1] + if isinstance(current, dict) and last_key in current: + current[last_key] = block["content"] + elif isinstance(current, list): + idx = int(last_key) + if idx < len(current): + current[idx] = block["content"] + + return result + + def translate_content( + self, + input_file: str, + output_file: Optional[str] = None, + source_lang: Optional[str] = None, + target_lang: Optional[str] = None, + verbose: bool = True, + log_level: str = "INFO" + ) -> str: + """ + Translate a file (markdown or YAML) and save the result. 
+ + Args: + input_file: Path to input file + output_file: Optional path to save translated file + source_lang: Optional source language override + target_lang: Optional target language override + verbose: Whether to print progress information + log_level: Logging level to use + + Returns: + Translated content as string + """ + # Set logging level for this translation + logger.setLevel(getattr(logging, log_level.upper())) + + # Save original language settings + original_source_lang = self.source_lang + original_target_lang = self.target_lang + + # Allow overriding source and target languages for this specific translation + if source_lang: + self.source_lang = self._normalize_language_code(source_lang) + logger.info( + f"Source language temporarily set to: {self.source_lang}") + + if target_lang: + self.target_lang = self._normalize_language_code(target_lang) + logger.info( + f"Target language temporarily set to: {self.target_lang}") + + try: + # Read the content of the file + content = self.read_file(input_file) + + # Check if the file is YAML + is_yaml = self._is_yaml_file(input_file) + + if is_yaml: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + # Parse the YAML content + # Process the file based on its type + if is_yaml: + try: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + original_yaml = yaml.safe_load(content) + blocks = self.split_yaml_into_blocks(content) + + # Translate each block + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = str(block["content"])[:50] + "..." 
if len(str(block["content"])) > 50 else str(block["content"]) + logger.info(f"Processing YAML block {i}/{total_blocks}: {block_preview}") + + translated = self.translate_yaml_block(block) + translated_blocks.append(translated) + + # Reconstruct the YAML structure with translated values + translated_yaml = self._reconstruct_yaml(original_yaml, translated_blocks) + + # Convert back to YAML string with preserved formatting + translated_content = yaml.dump(translated_yaml, allow_unicode=True, sort_keys=False) + except Exception as e: + logger.error(f"YAML translation failed: {str(e)}") + raise + else: + if verbose: + logger.info(f"Processing {input_file} as Markdown") + + # Split the markdown content into blocks + blocks = self.split_into_blocks(content) + + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = block["content"][:50] + "..." if len(block["content"]) > 50 else block["content"] + logger.info(f"Processing block {i}/{total_blocks} ({block['type']}): {block_preview}") + + translated = self.translate_block(block) + translated_blocks.append(translated) + + # Combine translated blocks with appropriate spacing + translated_content = "" + for i, block in enumerate(translated_blocks): + if i > 0: # Add spacing between blocks + if block["type"] == "header" or translated_blocks[i-1]["type"] == "header": + translated_content += "\n\n" + elif block["type"] == "paragraph" and translated_blocks[i-1]["type"] == "paragraph": + translated_content += "\n\n" + translated_content += block["content"] + + # Save the output and return + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + f.write(translated_content) + if verbose: + logger.info(f"Translation saved to: {output_file}") + + return translated_content + + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + raise + + finally: + # Restore original language settings + self.source_lang = original_source_lang + 
def main():
    """
    Command-line entry point: parse arguments, configure logging and
    translate a single markdown or YAML file.

    Returns:
        Process exit code: 0 on success (or after listing the supported
        languages), 1 when the input file is missing or translation fails
    """
    log_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']

    # argparse does not check defaults against `choices`, so an invalid or
    # lowercase LOG_LEVEL environment value must be normalized here.
    default_log_level = os.getenv("LOG_LEVEL", "INFO").upper()
    if default_log_level not in log_levels:
        default_log_level = "INFO"

    # Set up argument parser
    parser = argparse.ArgumentParser(
        description='Translate markdown and YAML files between languages using AI.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Positional argument; optional at parse time so that --list-languages
    # works on its own. Its presence is enforced below for translation runs.
    parser.add_argument(
        'input_file',
        type=str,
        nargs='?',
        help='Path to the input file (markdown or YAML)'
    )

    # Optional arguments
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Path to save the translated file. If not provided, the file is '
             'written next to the input as "STEM_LANGCODE.SUFFIX" (e.g. guide_fr.md)'
    )
    parser.add_argument(
        '--source-lang',
        type=str,
        help='Source language code or name (e.g., "es" or "spanish")'
    )
    parser.add_argument(
        '--target-lang',
        type=str,
        default='en',
        help='Target language code or name (e.g., "fr" or "french"), defaults to English'
    )
    parser.add_argument(
        '--api-key',
        type=str,
        default=os.getenv("API_KEY", "no-key"),
        help='API key for the translation service'
    )
    parser.add_argument(
        '--base-url',
        type=str,
        default=os.getenv("BASE_URL", "http://localhost:11434/v1/"),
        help='Base URL for the API endpoint'
    )
    parser.add_argument(
        '--model',
        type=str,
        default=os.getenv("MODEL_NAME", "granite3.1-dense:latest"),
        help='Model name to use for translation'
    )
    parser.add_argument(
        '--log-level',
        type=str.upper,  # accept "debug" as well as "DEBUG"
        choices=log_levels,
        default=default_log_level,
        help='Set the logging level'
    )
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Suppress progress information'
    )
    parser.add_argument(
        '--list-languages',
        action='store_true',
        help='List all supported languages and exit'
    )

    args = parser.parse_args()

    # If the user just wants to list supported languages
    if args.list_languages:
        print("Supported Languages:")
        for name, code in sorted(SUPPORTED_LANGUAGES.items()):
            print(f"  {name.capitalize()} ({code})")
        return 0

    if not args.input_file:
        # Same outcome as a missing required positional: usage + exit code 2
        parser.error("the following arguments are required: input_file")

    # Configure logging. The module already calls logging.basicConfig() at
    # import time, and basicConfig() does nothing once the root logger has
    # handlers, so the requested level/format are applied explicitly.
    level = getattr(logging, args.log_level)
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    root_logger = logging.getLogger()
    if root_logger.handlers:
        root_logger.setLevel(level)
        formatter = logging.Formatter(log_format)
        for handler in root_logger.handlers:
            handler.setFormatter(formatter)
    else:
        logging.basicConfig(level=level, format=log_format)
    logger = logging.getLogger("ContentTranslator")

    # Log configuration
    logger.info(f"Using model: {args.model}")
    logger.info(f"Using base URL: {args.base_url}")

    # Verify input file exists (a directory is not a usable input either)
    input_path = Path(args.input_file)
    if not input_path.is_file():
        logger.error(f"Input file not found: {args.input_file}")
        return 1

    # Set default output file if not provided: <stem>_<language code><suffix>
    if not args.output:
        target = args.target_lang.lower()
        # Language names map to their code; codes and unknown values pass through
        target_lang_code = SUPPORTED_LANGUAGES.get(target, target)
        args.output = str(
            input_path.parent / f"{input_path.stem}_{target_lang_code}{input_path.suffix}")

    # Create translator instance
    translator = ContentTranslator(
        api_key=args.api_key,
        base_url=args.base_url,
        model=args.model,
        source_lang=args.source_lang,
        target_lang=args.target_lang
    )

    try:
        translator.translate_content(
            args.input_file,
            args.output,
            source_lang=args.source_lang,
            target_lang=args.target_lang,
            verbose=not args.quiet,
            log_level=args.log_level
        )
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code
        logger.error(f"Translation failed: {str(e)}")
        return 1

    logger.info(
        f"Translation completed successfully! Output saved to: {args.output}")
    return 0


if __name__ == "__main__":
    # SystemExit works even when the site-provided exit() helper is unavailable
    raise SystemExit(main())
def translate_content(
    self,
    input_file: str,
    output_file: Optional[str] = None,
    source_lang: Optional[str] = None,
    target_lang: Optional[str] = None,
    verbose: bool = True,
    log_level: str = "INFO"
) -> str:
    """
    Translate a file (markdown or YAML) and save the result.

    Args:
        input_file: Path to input file
        output_file: Optional path to save translated file
        source_lang: Optional source language override (this call only)
        target_lang: Optional target language override (this call only)
        verbose: Whether to log progress information
        log_level: Logging level name to set on the module logger

    Returns:
        Translated content as string

    Raises:
        Exception: Any error raised while reading, translating or writing
            is logged and re-raised unchanged.
    """
    # Set logging level for this translation
    logger.setLevel(getattr(logging, log_level.upper()))

    # Save original language settings so the overrides below are temporary
    original_source_lang = self.source_lang
    original_target_lang = self.target_lang

    # Allow overriding source and target languages for this specific translation
    if source_lang:
        self.source_lang = self._normalize_language_code(source_lang)
        logger.info(
            f"Source language temporarily set to: {self.source_lang}")

    if target_lang:
        self.target_lang = self._normalize_language_code(target_lang)
        logger.info(
            f"Target language temporarily set to: {self.target_lang}")

    try:
        # Read the content of the file
        content = self.read_file(input_file)

        # Process the file based on its type
        if self._is_yaml_file(input_file):
            if verbose:
                logger.info(f"Processing {input_file} as YAML")

            try:
                original_yaml = yaml.safe_load(content)

                if original_yaml is None:
                    # Empty (or comment-only) document: nothing to translate,
                    # so the text is passed through unchanged.
                    translated_content = content
                else:
                    blocks = self.split_yaml_into_blocks(content)

                    # Translate each block
                    translated_blocks = []
                    total_blocks = len(blocks)

                    for i, block in enumerate(blocks, 1):
                        if verbose:
                            block_text = str(block["content"])
                            block_preview = block_text[:50] + "..." if len(block_text) > 50 else block_text
                            logger.info(f"Processing YAML block {i}/{total_blocks}: {block_preview}")

                        translated_blocks.append(self.translate_yaml_block(block))

                    # Reconstruct the YAML structure with translated values
                    translated_yaml = self._reconstruct_yaml(original_yaml, translated_blocks)

                    # Convert back to a YAML string, keeping key order and non-ASCII text
                    translated_content = yaml.dump(translated_yaml, allow_unicode=True, sort_keys=False)
            except Exception as e:
                logger.error(f"YAML translation failed: {str(e)}")
                raise
        else:
            if verbose:
                logger.info(f"Processing {input_file} as Markdown")

            # Split the markdown content into blocks
            blocks = self.split_into_blocks(content)

            translated_blocks = []
            total_blocks = len(blocks)

            for i, block in enumerate(blocks, 1):
                if verbose:
                    block_preview = block["content"][:50] + "..." if len(block["content"]) > 50 else block["content"]
                    logger.info(f"Processing block {i}/{total_blocks} ({block['type']}): {block_preview}")

                translated_blocks.append(self.translate_block(block))

            # Combine translated blocks: a blank line goes around headers and
            # between consecutive paragraphs; other neighbours are joined as-is.
            parts = []
            for i, block in enumerate(translated_blocks):
                if i > 0:
                    previous_type = translated_blocks[i - 1]["type"]
                    if block["type"] == "header" or previous_type == "header":
                        parts.append("\n\n")
                    elif block["type"] == "paragraph" and previous_type == "paragraph":
                        parts.append("\n\n")
                parts.append(block["content"])
            translated_content = "".join(parts)

        # Save the output and return
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(translated_content)
            if verbose:
                logger.info(f"Translation saved to: {output_file}")

        return translated_content

    except Exception as e:
        logger.error(f"Translation failed: {str(e)}")
        raise

    finally:
        # Restore original language settings
        self.source_lang = original_source_lang
        self.target_lang = original_target_lang
+ logging.basicConfig( + level=getattr(logging, args.log_level), + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + logger = logging.getLogger("ContentTranslator") + + # Log configuration + logger.info(f"Using model: {args.model}") + logger.info(f"Using base URL: {args.base_url}") + + # Verify input file exists + if not os.path.exists(args.input_file): + logger.error(f"Input file not found: {args.input_file}") + return 1 + + # Set default output file if not provided + if not args.output: + input_path = Path(args.input_file) + target_lang_code = args.target_lang.lower() + target_lang_code = next((code for name, code in SUPPORTED_LANGUAGES.items() + if name.lower() == target_lang_code.lower() + or code.lower() == target_lang_code.lower()), + target_lang_code) + args.output = str( + input_path.parent / f"{input_path.stem}_{target_lang_code}{input_path.suffix}") + + # Create translator instance + translator = ContentTranslator( + api_key=args.api_key, + base_url=args.base_url, + model=args.model, + source_lang=args.source_lang, + target_lang=args.target_lang + ) + + try: + translated_content = translator.translate_content( + args.input_file, + args.output, + source_lang=args.source_lang, + target_lang=args.target_lang, + verbose=not args.quiet, + log_level=args.log_level + ) + logger.info( + f"Translation completed successfully! Output saved to: {args.output}") + return 0 + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + return 1 + + +if __name__ == "__main__": + exit(main()), + r'Translation:.* + + def _reconstruct_yaml(self, yaml_data: Dict[str, Any], translated_blocks: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Reconstruct YAML with translated values. 
+ + Args: + yaml_data: Original YAML data + translated_blocks: List of translated blocks with path information + + Returns: + Updated YAML data with translations + """ + # Create a deep copy of the original data + result = yaml_data.copy() + + # Apply translations + for block in translated_blocks: + if block["type"] != "yaml_value": + continue + + path = block["path"] + current = result + + # Navigate to the parent + for i, key in enumerate(path[:-1]): + if isinstance(current, dict): + if key not in current: + break + current = current[key] + elif isinstance(current, list): + idx = int(key) + if idx >= len(current): + break + current = current[idx] + else: + break + + # Update the value + last_key = path[-1] + if isinstance(current, dict) and last_key in current: + current[last_key] = block["content"] + elif isinstance(current, list): + idx = int(last_key) + if idx < len(current): + current[idx] = block["content"] + + return result + + def translate_content( + self, + input_file: str, + output_file: Optional[str] = None, + source_lang: Optional[str] = None, + target_lang: Optional[str] = None, + verbose: bool = True, + log_level: str = "INFO" + ) -> str: + """ + Translate a file (markdown or YAML) and save the result. 
+ + Args: + input_file: Path to input file + output_file: Optional path to save translated file + source_lang: Optional source language override + target_lang: Optional target language override + verbose: Whether to print progress information + log_level: Logging level to use + + Returns: + Translated content as string + """ + # Set logging level for this translation + logger.setLevel(getattr(logging, log_level.upper())) + + # Save original language settings + original_source_lang = self.source_lang + original_target_lang = self.target_lang + + # Allow overriding source and target languages for this specific translation + if source_lang: + self.source_lang = self._normalize_language_code(source_lang) + logger.info( + f"Source language temporarily set to: {self.source_lang}") + + if target_lang: + self.target_lang = self._normalize_language_code(target_lang) + logger.info( + f"Target language temporarily set to: {self.target_lang}") + + try: + # Read the content of the file + content = self.read_file(input_file) + + # Check if the file is YAML + is_yaml = self._is_yaml_file(input_file) + + if is_yaml: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + # Parse the YAML content + # Process the file based on its type + if is_yaml: + try: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + original_yaml = yaml.safe_load(content) + blocks = self.split_yaml_into_blocks(content) + + # Translate each block + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = str(block["content"])[:50] + "..." 
if len(str(block["content"])) > 50 else str(block["content"]) + logger.info(f"Processing YAML block {i}/{total_blocks}: {block_preview}") + + translated = self.translate_yaml_block(block) + translated_blocks.append(translated) + + # Reconstruct the YAML structure with translated values + translated_yaml = self._reconstruct_yaml(original_yaml, translated_blocks) + + # Convert back to YAML string with preserved formatting + translated_content = yaml.dump(translated_yaml, allow_unicode=True, sort_keys=False) + except Exception as e: + logger.error(f"YAML translation failed: {str(e)}") + raise + else: + if verbose: + logger.info(f"Processing {input_file} as Markdown") + + # Split the markdown content into blocks + blocks = self.split_into_blocks(content) + + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = block["content"][:50] + "..." if len(block["content"]) > 50 else block["content"] + logger.info(f"Processing block {i}/{total_blocks} ({block['type']}): {block_preview}") + + translated = self.translate_block(block) + translated_blocks.append(translated) + + # Combine translated blocks with appropriate spacing + translated_content = "" + for i, block in enumerate(translated_blocks): + if i > 0: # Add spacing between blocks + if block["type"] == "header" or translated_blocks[i-1]["type"] == "header": + translated_content += "\n\n" + elif block["type"] == "paragraph" and translated_blocks[i-1]["type"] == "paragraph": + translated_content += "\n\n" + translated_content += block["content"] + + # Save the output and return + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + f.write(translated_content) + if verbose: + logger.info(f"Translation saved to: {output_file}") + + return translated_content + + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + raise + + finally: + # Restore original language settings + self.source_lang = original_source_lang + 
def main(argv: Optional[List[str]] = None) -> int:
    """
    Command-line entry point: translate one markdown or YAML file.

    Args:
        argv: Argument list to parse instead of the process arguments.
            None (the default) lets argparse read sys.argv[1:].

    Returns:
        0 after a successful translation or after listing the supported
        languages, 1 if the input file does not exist or translation fails.

    Raises:
        SystemExit: Raised by argparse (exit status 2) for invalid
            arguments, a missing input file or an unknown log level.
    """
    log_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    # Set up argument parser
    parser = argparse.ArgumentParser(
        description='Translate markdown and YAML files between languages using AI.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Optional at parse time so that --list-languages works on its own; its
    # presence is enforced below for every other invocation.
    parser.add_argument(
        'input_file',
        nargs='?',
        type=str,
        help='Path to the input file (markdown or YAML)'
    )

    # Optional arguments
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Path to save the translated file. If not provided, the input '
             'file name with "_<target language code>" appended to its stem is used'
    )
    parser.add_argument(
        '--source-lang',
        type=str,
        help='Source language code or name (e.g., "es" or "spanish")'
    )
    parser.add_argument(
        '--target-lang',
        type=str,
        default='en',
        help='Target language code or name (e.g., "fr" or "french"), defaults to English'
    )
    # The environment fallback is resolved after parsing: the help formatter
    # prints every default, which would otherwise echo the key in --help.
    parser.add_argument(
        '--api-key',
        type=str,
        default=None,
        help='API key for the translation service (falls back to the API_KEY '
             'environment variable, then "no-key")'
    )
    parser.add_argument(
        '--base-url',
        type=str,
        default=os.getenv("BASE_URL", "http://localhost:11434/v1/"),
        help='Base URL for the API endpoint'
    )
    parser.add_argument(
        '--model',
        type=str,
        default=os.getenv("MODEL_NAME", "granite3.1-dense:latest"),
        help='Model name to use for translation'
    )
    parser.add_argument(
        '--log-level',
        type=str.upper,
        choices=log_levels,
        default=os.getenv("LOG_LEVEL", "INFO"),
        help='Set the logging level'
    )
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Suppress progress information'
    )
    parser.add_argument(
        '--list-languages',
        action='store_true',
        help='List all supported languages and exit'
    )

    args = parser.parse_args(argv)

    # If the user just wants to list supported languages
    if args.list_languages:
        print("Supported Languages:")
        for name, code in sorted(SUPPORTED_LANGUAGES.items()):
            print(f" {name.capitalize()} ({code})")
        return 0

    if args.input_file is None:
        parser.error("the following arguments are required: input_file")

    # argparse does not validate defaults against `choices`, so a value that
    # came from the LOG_LEVEL environment variable is checked here.
    level_name = str(args.log_level).upper()
    if level_name not in log_levels:
        parser.error(
            f"invalid log level: {args.log_level!r} (choose from {', '.join(log_levels)})")
    level = getattr(logging, level_name)

    # Configure logging. basicConfig does nothing once the root logger has
    # handlers (the module configures it at import time), so the level and
    # format are also applied explicitly.
    logging.basicConfig(level=level, format=log_format)
    root_logger = logging.getLogger()
    root_logger.setLevel(level)
    formatter = logging.Formatter(log_format)
    for handler in root_logger.handlers:
        handler.setFormatter(formatter)
    logger = logging.getLogger("ContentTranslator")

    # Log configuration
    logger.info(f"Using model: {args.model}")
    logger.info(f"Using base URL: {args.base_url}")

    # Verify input file exists
    if not os.path.exists(args.input_file):
        logger.error(f"Input file not found: {args.input_file}")
        return 1

    # Set default output file if not provided
    if not args.output:
        input_path = Path(args.input_file)
        requested = args.target_lang.lower()
        # A language name maps to its code; anything else (including a value
        # that already is a code) is used unchanged.
        target_lang_code = SUPPORTED_LANGUAGES.get(requested, requested)
        args.output = str(
            input_path.parent / f"{input_path.stem}_{target_lang_code}{input_path.suffix}")

    api_key = args.api_key
    if api_key is None:
        api_key = os.getenv("API_KEY", "no-key")

    # Create translator instance
    translator = ContentTranslator(
        api_key=api_key,
        base_url=args.base_url,
        model=args.model,
        source_lang=args.source_lang,
        target_lang=args.target_lang
    )

    try:
        translator.translate_content(
            args.input_file,
            args.output,
            source_lang=args.source_lang,
            target_lang=args.target_lang,
            verbose=not args.quiet,
            log_level=level_name
        )
        logger.info(
            f"Translation completed successfully! Output saved to: {args.output}")
        return 0
    except Exception as e:
        # Top-level boundary: report the failure through the exit code.
        logger.error(f"Translation failed: {str(e)}")
        return 1


if __name__ == "__main__":
    # SystemExit is raised directly; the `exit` helper is only installed by
    # the site module and is not guaranteed to exist.
    raise SystemExit(main())
+ + Args: + yaml_data: Original YAML data + translated_blocks: List of translated blocks with path information + + Returns: + Updated YAML data with translations + """ + # Create a deep copy of the original data + result = yaml_data.copy() + + # Apply translations + for block in translated_blocks: + if block["type"] != "yaml_value": + continue + + path = block["path"] + current = result + + # Navigate to the parent + for i, key in enumerate(path[:-1]): + if isinstance(current, dict): + if key not in current: + break + current = current[key] + elif isinstance(current, list): + idx = int(key) + if idx >= len(current): + break + current = current[idx] + else: + break + + # Update the value + last_key = path[-1] + if isinstance(current, dict) and last_key in current: + current[last_key] = block["content"] + elif isinstance(current, list): + idx = int(last_key) + if idx < len(current): + current[idx] = block["content"] + + return result + + def translate_content( + self, + input_file: str, + output_file: Optional[str] = None, + source_lang: Optional[str] = None, + target_lang: Optional[str] = None, + verbose: bool = True, + log_level: str = "INFO" + ) -> str: + """ + Translate a file (markdown or YAML) and save the result. 
+ + Args: + input_file: Path to input file + output_file: Optional path to save translated file + source_lang: Optional source language override + target_lang: Optional target language override + verbose: Whether to print progress information + log_level: Logging level to use + + Returns: + Translated content as string + """ + # Set logging level for this translation + logger.setLevel(getattr(logging, log_level.upper())) + + # Save original language settings + original_source_lang = self.source_lang + original_target_lang = self.target_lang + + # Allow overriding source and target languages for this specific translation + if source_lang: + self.source_lang = self._normalize_language_code(source_lang) + logger.info( + f"Source language temporarily set to: {self.source_lang}") + + if target_lang: + self.target_lang = self._normalize_language_code(target_lang) + logger.info( + f"Target language temporarily set to: {self.target_lang}") + + try: + # Read the content of the file + content = self.read_file(input_file) + + # Check if the file is YAML + is_yaml = self._is_yaml_file(input_file) + + if is_yaml: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + # Parse the YAML content + # Process the file based on its type + if is_yaml: + try: + if verbose: + logger.info(f"Processing {input_file} as YAML") + + original_yaml = yaml.safe_load(content) + blocks = self.split_yaml_into_blocks(content) + + # Translate each block + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = str(block["content"])[:50] + "..." 
if len(str(block["content"])) > 50 else str(block["content"]) + logger.info(f"Processing YAML block {i}/{total_blocks}: {block_preview}") + + translated = self.translate_yaml_block(block) + translated_blocks.append(translated) + + # Reconstruct the YAML structure with translated values + translated_yaml = self._reconstruct_yaml(original_yaml, translated_blocks) + + # Convert back to YAML string with preserved formatting + translated_content = yaml.dump(translated_yaml, allow_unicode=True, sort_keys=False) + except Exception as e: + logger.error(f"YAML translation failed: {str(e)}") + raise + else: + if verbose: + logger.info(f"Processing {input_file} as Markdown") + + # Split the markdown content into blocks + blocks = self.split_into_blocks(content) + + translated_blocks = [] + total_blocks = len(blocks) + + for i, block in enumerate(blocks, 1): + if verbose: + block_preview = block["content"][:50] + "..." if len(block["content"]) > 50 else block["content"] + logger.info(f"Processing block {i}/{total_blocks} ({block['type']}): {block_preview}") + + translated = self.translate_block(block) + translated_blocks.append(translated) + + # Combine translated blocks with appropriate spacing + translated_content = "" + for i, block in enumerate(translated_blocks): + if i > 0: # Add spacing between blocks + if block["type"] == "header" or translated_blocks[i-1]["type"] == "header": + translated_content += "\n\n" + elif block["type"] == "paragraph" and translated_blocks[i-1]["type"] == "paragraph": + translated_content += "\n\n" + translated_content += block["content"] + + # Save the output and return + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + f.write(translated_content) + if verbose: + logger.info(f"Translation saved to: {output_file}") + + return translated_content + + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + raise + + finally: + # Restore original language settings + self.source_lang = original_source_lang + 
def main(argv: Optional[List[str]] = None) -> int:
    """
    Command-line entry point: translate one markdown or YAML file.

    Args:
        argv: Argument list to parse instead of the process arguments.
            None (the default) lets argparse read sys.argv[1:].

    Returns:
        0 after a successful translation or after listing the supported
        languages, 1 if the input file does not exist or translation fails.

    Raises:
        SystemExit: Raised by argparse (exit status 2) for invalid
            arguments, a missing input file or an unknown log level.
    """
    log_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    # Set up argument parser
    parser = argparse.ArgumentParser(
        description='Translate markdown and YAML files between languages using AI.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Optional at parse time so that --list-languages works on its own; its
    # presence is enforced below for every other invocation.
    parser.add_argument(
        'input_file',
        nargs='?',
        type=str,
        help='Path to the input file (markdown or YAML)'
    )

    # Optional arguments
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Path to save the translated file. If not provided, the input '
             'file name with "_<target language code>" appended to its stem is used'
    )
    parser.add_argument(
        '--source-lang',
        type=str,
        help='Source language code or name (e.g., "es" or "spanish")'
    )
    parser.add_argument(
        '--target-lang',
        type=str,
        default='en',
        help='Target language code or name (e.g., "fr" or "french"), defaults to English'
    )
    # The environment fallback is resolved after parsing: the help formatter
    # prints every default, which would otherwise echo the key in --help.
    parser.add_argument(
        '--api-key',
        type=str,
        default=None,
        help='API key for the translation service (falls back to the API_KEY '
             'environment variable, then "no-key")'
    )
    parser.add_argument(
        '--base-url',
        type=str,
        default=os.getenv("BASE_URL", "http://localhost:11434/v1/"),
        help='Base URL for the API endpoint'
    )
    parser.add_argument(
        '--model',
        type=str,
        default=os.getenv("MODEL_NAME", "granite3.1-dense:latest"),
        help='Model name to use for translation'
    )
    parser.add_argument(
        '--log-level',
        type=str.upper,
        choices=log_levels,
        default=os.getenv("LOG_LEVEL", "INFO"),
        help='Set the logging level'
    )
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Suppress progress information'
    )
    parser.add_argument(
        '--list-languages',
        action='store_true',
        help='List all supported languages and exit'
    )

    args = parser.parse_args(argv)

    # If the user just wants to list supported languages
    if args.list_languages:
        print("Supported Languages:")
        for name, code in sorted(SUPPORTED_LANGUAGES.items()):
            print(f" {name.capitalize()} ({code})")
        return 0

    if args.input_file is None:
        parser.error("the following arguments are required: input_file")

    # argparse does not validate defaults against `choices`, so a value that
    # came from the LOG_LEVEL environment variable is checked here.
    level_name = str(args.log_level).upper()
    if level_name not in log_levels:
        parser.error(
            f"invalid log level: {args.log_level!r} (choose from {', '.join(log_levels)})")
    level = getattr(logging, level_name)

    # Configure logging. basicConfig does nothing once the root logger has
    # handlers (the module configures it at import time), so the level and
    # format are also applied explicitly.
    logging.basicConfig(level=level, format=log_format)
    root_logger = logging.getLogger()
    root_logger.setLevel(level)
    formatter = logging.Formatter(log_format)
    for handler in root_logger.handlers:
        handler.setFormatter(formatter)
    logger = logging.getLogger("ContentTranslator")

    # Log configuration
    logger.info(f"Using model: {args.model}")
    logger.info(f"Using base URL: {args.base_url}")

    # Verify input file exists
    if not os.path.exists(args.input_file):
        logger.error(f"Input file not found: {args.input_file}")
        return 1

    # Set default output file if not provided
    if not args.output:
        input_path = Path(args.input_file)
        requested = args.target_lang.lower()
        # A language name maps to its code; anything else (including a value
        # that already is a code) is used unchanged.
        target_lang_code = SUPPORTED_LANGUAGES.get(requested, requested)
        args.output = str(
            input_path.parent / f"{input_path.stem}_{target_lang_code}{input_path.suffix}")

    api_key = args.api_key
    if api_key is None:
        api_key = os.getenv("API_KEY", "no-key")

    # Create translator instance
    translator = ContentTranslator(
        api_key=api_key,
        base_url=args.base_url,
        model=args.model,
        source_lang=args.source_lang,
        target_lang=args.target_lang
    )

    try:
        translator.translate_content(
            args.input_file,
            args.output,
            source_lang=args.source_lang,
            target_lang=args.target_lang,
            verbose=not args.quiet,
            log_level=level_name
        )
        logger.info(
            f"Translation completed successfully! Output saved to: {args.output}")
        return 0
    except Exception as e:
        # Top-level boundary: report the failure through the exit code.
        logger.error(f"Translation failed: {str(e)}")
        return 1


if __name__ == "__main__":
    # SystemExit is raised directly; the `exit` helper is only installed by
    # the site module and is not guaranteed to exist.
    raise SystemExit(main())