171 changes: 150 additions & 21 deletions convert_to_pdf.py
@@ -8,24 +8,24 @@
and set START_AT_CHAPTER to that number (include the padded 0's). Then run the script again; it should pick up
where it left off.

__author__ = "Shane DeMorais"
__copyright__ = "Copyright 2023, Planet Earth (I think)"
__author__ = "Zindan Kurt (Enhanced from Shane DeMorais)"
__copyright__ = "Copyright 2025"
__contributors__ = "Enhanced for Pine Script v6 with improved error handling and formatting"
"""

import logging
import pdfkit
import os
import re
import tempfile

import pdfkit
import requests
import lxml
from bs4 import BeautifulSoup
import os
import sys
import re


class Constant:
PINESCRIPT_MANUAL_URL = "https://www.tradingview.com/pine-script-docs/en/v5/index.html"
DOMAIN_NAME = "https://www.tradingview.com/pine-script-docs/en/v5/"
PINESCRIPT_MANUAL_URL = "https://www.tradingview.com/pine-script-docs"
DOMAIN_NAME = "https://www.tradingview.com/pine-script-docs"
START_AT_CHAPTER = ""
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
@@ -47,7 +47,7 @@ def create_pdf_name(chapter_num, anchor):
anchor = anchor.replace("/", "_").replace(".html", "") + ".pdf"
chapter = f'{chapter_num:05d}_'

pdf_name = "./chapters/" + chapter + anchor
pdf_name = "./chapters_v6/" + chapter + anchor

return pdf_name

@@ -66,9 +66,16 @@ def find_chapters(start_url):
f = requests.get(start_url, headers=Constant.HEADERS)

soup = BeautifulSoup(f.content, 'lxml')
chapters = soup.find('div', {
'class': 'toctree-wrapper'
}).find_all('a')

# Try different selectors for the table of contents
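    # The docs site layout changed with the move away from /en/v5/, so several candidate containers
    # are tried in order; the legacy 'toctree-wrapper' is kept as a fallback for the old layout.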
toc_container = (soup.find('div', {'class': 'sidebar'}) or
soup.find('aside', {'class': 'keep-visible'}) or
soup.find('div', {'class': 'toctree-wrapper'}))

if not toc_container:
raise Exception("Could not find table of contents container")

chapters = toc_container.find_all('a')

chapters = prune_subchapters(chapters)

@@ -78,19 +85,128 @@


def save_html_as_pdf(anchor, pdf_name):
url = Constant.DOMAIN_NAME + anchor
# Handle absolute paths that start with /
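    # Anchors in the newer docs TOC are root-relative (e.g. /pine-script-docs/...), so they are joined
    # to the site root rather than to DOMAIN_NAME.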
if anchor.startswith('/'):
url = "https://www.tradingview.com" + anchor
else:
url = Constant.DOMAIN_NAME + anchor

try:
pdfkit.from_url(url, pdf_name)
# Fetch the HTML content
response = requests.get(url, headers=Constant.HEADERS)
response.raise_for_status()

# Parse and modify the HTML
soup = BeautifulSoup(response.content, 'lxml')

# Convert relative image URLs to absolute URLs
base_url = "https://www.tradingview.com"
for img in soup.find_all('img'):
src = img.get('src')
if src:
if src.startswith('/'):
# Relative URL starting with /
img['src'] = base_url + src
elif src.startswith('../'):
# Relative URL with ../
img['src'] = base_url + src.replace('../', '/')
elif not src.startswith(('http://', 'https://', 'data:')):
# Other relative URLs
img['src'] = base_url + '/' + src.lstrip('./')

# Also fix any CSS background images
for element in soup.find_all(style=True):
style = element.get('style', '')
if 'background-image' in style or 'background:' in style:
# Simple regex to find url() references
def fix_css_url(match):
css_url = match.group(1).strip('\'"')
if css_url.startswith('/'):
return f'url("{base_url}{css_url}")'
elif not css_url.startswith(('http://', 'https://', 'data:')):
return 'url("' + base_url + '/' + css_url.lstrip('./') + '")'
return match.group(0)

style = re.sub(r'url\([\'"]?([^\'")]+)[\'"]?\)', fix_css_url, style)
element['style'] = style

# Add custom CSS to make content wider
custom_css = """
<style>
body {
max-width: none !important;
width: 100% !important;
margin: 0 !important;
padding: 20px !important;
}
.container, .content, .main-content, .documentation-content {
max-width: none !important;
width: 100% !important;
margin: 0 !important;
}
.sidebar {
display: none !important;
}
article, main {
max-width: none !important;
width: 100% !important;
}
p, div, span {
max-width: none !important;
}
@media print {
body { width: 100% !important; max-width: none !important; }
.sidebar { display: none !important; }
}
</style>
"""

# Insert the custom CSS into the head
if soup.head:
soup.head.append(BeautifulSoup(custom_css, 'html.parser'))

# Create a temporary HTML file
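        # delete=False keeps the file on disk after the with-block so wkhtmltopdf can read it;
        # it is removed explicitly with os.unlink() once the PDF has been generated.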
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as temp_file:
temp_file.write(str(soup))
temp_html_path = temp_file.name

# Options for PDF generation
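        # pdfkit forwards these keys to wkhtmltopdf as command-line flags: A3 portrait with narrow
        # margins, and load/media errors ignored so a missing asset does not abort the render.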
options = {
'page-size': 'A3',
'margin-top': '0.4in',
'margin-right': '0.4in',
'margin-bottom': '0.4in',
'margin-left': '0.4in',
'encoding': "UTF-8",
'orientation': 'Portrait',
'quiet': '',
'load-error-handling': 'ignore',
'load-media-error-handling': 'ignore'
}

# Convert the temporary HTML file to PDF
pdfkit.from_file(temp_html_path, pdf_name, options=options)

# Clean up the temporary file
os.unlink(temp_html_path)

logging.info(f"- Downloaded {pdf_name}")
return pdf_name

except Exception as ex:
template = "An exception of type {0} occurred. Arguments:\n{1!r}"
message = template.format(type(ex).__name__, ex.args)

sys.exit(message)

return pdf_name

logging.error(f"Failed to download {pdf_name}: {message}")

# Clean up temporary file if it exists
try:
if 'temp_html_path' in locals():
os.unlink(temp_html_path)
        except OSError:
pass

return None


def download_chapter(chapters):
@@ -99,6 +215,9 @@ def download_chapter(chapters):
download_html = False if len(Constant.START_AT_CHAPTER) > 0 else True

chapter_num = 1
successful_downloads = 0
failed_downloads = 0

for anchor in chapters:
anchor = anchor['href']

@@ -111,13 +230,23 @@

if download_html:
logging.info(f"Downloading {pdf_name}")
save_html_as_pdf(anchor, pdf_name)
result = save_html_as_pdf(anchor, pdf_name)

if result:
successful_downloads += 1
else:
failed_downloads += 1
logging.warning("Skipping failed download and continuing with next chapter...")

chapter_num += 1

logging.info(f"Download complete! Successful: {successful_downloads}, Failed: {failed_downloads}")


def main(start_url):

# Create output directory if it doesn't exist
os.makedirs("./chapters_v6", exist_ok=True)

try:
chapters = find_chapters(start_url)
download_chapter(chapters)