171 changes: 150 additions & 21 deletions convert_to_pdf.py
@@ -8,24 +8,24 @@
and set START_AT_CHAPTER to that number (include the padded 0's). Then run the script again; it should pick up
where it left off.

__author__ = "Shane DeMorais"
__copyright__ = "Copyright 2023, Planet Earth (I think)"
__author__ = "Zindan Kurt (Enhanced from Shane DeMorais)"
__copyright__ = "Copyright 2025"
__contributors__ = "Enhanced for Pine Script v6 with improved error handling and formatting"
"""

import logging
import pdfkit
import os
import re
import tempfile

import pdfkit
import requests
import lxml
from bs4 import BeautifulSoup
import os
import sys
import re


class Constant:
PINESCRIPT_MANUAL_URL = "https://www.tradingview.com/pine-script-docs/en/v5/index.html"
DOMAIN_NAME = "https://www.tradingview.com/pine-script-docs/en/v5/"
PINESCRIPT_MANUAL_URL = "https://www.tradingview.com/pine-script-docs"
DOMAIN_NAME = "https://www.tradingview.com/pine-script-docs"
START_AT_CHAPTER = ""
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
@@ -47,7 +47,7 @@ def create_pdf_name(chapter_num, anchor):
anchor = anchor.replace("/", "_").replace(".html", "") + ".pdf"
chapter = f'{chapter_num:05d}_'

pdf_name = "./chapters/" + chapter + anchor
pdf_name = "./chapters_v6/" + chapter + anchor

return pdf_name

@@ -66,9 +66,16 @@ def find_chapters(start_url):
f = requests.get(start_url, headers=Constant.HEADERS)

soup = BeautifulSoup(f.content, 'lxml')
chapters = soup.find('div', {
'class': 'toctree-wrapper'
}).find_all('a')

# Try different selectors for the table of contents
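    # The docs site layout changed with the move away from /en/v5/, so several candidate containers
    # are tried in order; the legacy 'toctree-wrapper' is kept as a fallback for the old layout.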
toc_container = (soup.find('div', {'class': 'sidebar'}) or
soup.find('aside', {'class': 'keep-visible'}) or
soup.find('div', {'class': 'toctree-wrapper'}))

if not toc_container:
raise Exception("Could not find table of contents container")

chapters = toc_container.find_all('a')

chapters = prune_subchapters(chapters)

@@ -78,19 +85,128 @@


def save_html_as_pdf(anchor, pdf_name):
url = Constant.DOMAIN_NAME + anchor
# Handle absolute paths that start with /
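    # Anchors in the newer docs TOC are root-relative (e.g. /pine-script-docs/...), so they are joined
    # to the site root rather than to DOMAIN_NAME.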
if anchor.startswith('/'):
url = "https://www.tradingview.com" + anchor
else:
url = Constant.DOMAIN_NAME + anchor

try:
pdfkit.from_url(url, pdf_name)
# Fetch the HTML content
response = requests.get(url, headers=Constant.HEADERS)
response.raise_for_status()

# Parse and modify the HTML
soup = BeautifulSoup(response.content, 'lxml')

# Convert relative image URLs to absolute URLs
base_url = "https://www.tradingview.com"
for img in soup.find_all('img'):
src = img.get('src')
if src:
if src.startswith('/'):
# Relative URL starting with /
img['src'] = base_url + src
elif src.startswith('../'):
# Relative URL with ../
img['src'] = base_url + src.replace('../', '/')
elif not src.startswith(('http://', 'https://', 'data:')):
# Other relative URLs
img['src'] = base_url + '/' + src.lstrip('./')

# Also fix any CSS background images
for element in soup.find_all(style=True):
style = element.get('style', '')
if 'background-image' in style or 'background:' in style:
# Simple regex to find url() references
def fix_css_url(match):
css_url = match.group(1).strip('\'"')
if css_url.startswith('/'):
return f'url("{base_url}{css_url}")'
elif not css_url.startswith(('http://', 'https://', 'data:')):
return 'url("' + base_url + '/' + css_url.lstrip('./') + '")'
return match.group(0)

style = re.sub(r'url\([\'"]?([^\'")]+)[\'"]?\)', fix_css_url, style)
element['style'] = style

# Add custom CSS to make content wider
custom_css = """
<style>
body {
max-width: none !important;
width: 100% !important;
margin: 0 !important;
padding: 20px !important;
}
.container, .content, .main-content, .documentation-content {
max-width: none !important;
width: 100% !important;
margin: 0 !important;
}
.sidebar {
display: none !important;
}
article, main {
max-width: none !important;
width: 100% !important;
}
p, div, span {
max-width: none !important;
}
@media print {
body { width: 100% !important; max-width: none !important; }
.sidebar { display: none !important; }
}
</style>
"""

# Insert the custom CSS into the head
if soup.head:
soup.head.append(BeautifulSoup(custom_css, 'html.parser'))

# Create a temporary HTML file
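        # delete=False keeps the file on disk after the with-block so wkhtmltopdf can read it;
        # it is removed explicitly with os.unlink() once the PDF has been generated.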
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as temp_file:
temp_file.write(str(soup))
temp_html_path = temp_file.name

# Options for PDF generation
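        # pdfkit forwards these keys to wkhtmltopdf as command-line flags: A3 portrait with narrow
        # margins, and load/media errors ignored so a missing asset does not abort the render.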
options = {
'page-size': 'A3',
'margin-top': '0.4in',
'margin-right': '0.4in',
'margin-bottom': '0.4in',
'margin-left': '0.4in',
'encoding': "UTF-8",
'orientation': 'Portrait',
'quiet': '',
'load-error-handling': 'ignore',
'load-media-error-handling': 'ignore'
}

# Convert the temporary HTML file to PDF
pdfkit.from_file(temp_html_path, pdf_name, options=options)

# Clean up the temporary file
os.unlink(temp_html_path)

logging.info(f"- Downloaded {pdf_name}")
return pdf_name

except Exception as ex:
template = "An exception of type {0} occurred. Arguments:\n{1!r}"
message = template.format(type(ex).__name__, ex.args)

sys.exit(message)

return pdf_name

logging.error(f"Failed to download {pdf_name}: {message}")

# Clean up temporary file if it exists
try:
if 'temp_html_path' in locals():
os.unlink(temp_html_path)
        except OSError:
pass

return None


def download_chapter(chapters):
@@ -99,6 +215,9 @@ def download_chapter(chapters):
download_html = False if len(Constant.START_AT_CHAPTER) > 0 else True

chapter_num = 1
successful_downloads = 0
failed_downloads = 0

for anchor in chapters:
anchor = anchor['href']

@@ -111,13 +230,23 @@

if download_html:
logging.info(f"Downloading {pdf_name}")
save_html_as_pdf(anchor, pdf_name)
result = save_html_as_pdf(anchor, pdf_name)

if result:
successful_downloads += 1
else:
failed_downloads += 1
logging.warning("Skipping failed download and continuing with next chapter...")

chapter_num += 1

logging.info(f"Download complete! Successful: {successful_downloads}, Failed: {failed_downloads}")


def main(start_url):

# Create output directory if it doesn't exist
os.makedirs("./chapters_v6", exist_ok=True)

try:
chapters = find_chapters(start_url)
download_chapter(chapters)