-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathgenerate_sitemap.py
More file actions
193 lines (160 loc) · 6.54 KB
/
generate_sitemap.py
File metadata and controls
193 lines (160 loc) · 6.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import requests
import xml.etree.ElementTree as ET
from xml.dom import minidom
from tqdm import tqdm
import re
import urllib.parse
# --------------------------------------------------------------------
# 1) Definitions & Constants
# --------------------------------------------------------------------

# Upstream SUMMARY.md files listing every page of each book.
SUMMARY_URL_BOOK = "https://raw.githubusercontent.com/HackTricks-wiki/hacktricks/refs/heads/master/src/SUMMARY.md"
SUMMARY_URL_CLOUD = "https://raw.githubusercontent.com/HackTricks-wiki/hacktricks-cloud/refs/heads/master/src/SUMMARY.md"

# Public domains the generated sitemap entries point at.
BOOK_DOMAIN = "book.hacktricks.wiki"
CLOUD_DOMAIN = "cloud.hacktricks.wiki"

# Translated language codes (identity mapping, kept as a dict so callers
# can both look up by key and iterate .values()).
languages = {code: code for code in (
    "es", "af", "zh", "fr", "de", "el", "hi", "it",
    "ja", "ko", "pl", "pt", "sr", "sw", "tr", "uk",
)}
# --------------------------------------------------------------------
# 2) Helper Functions
# --------------------------------------------------------------------
def fetch_summary(url):
    """Download a SUMMARY.md-like file and return its text body."""
    resp = requests.get(url, timeout=30)
    # Fail loudly on HTTP 4xx/5xx instead of parsing an error page.
    resp.raise_for_status()
    return resp.text
def parse_paths_from_summary(summary_text):
    """
    Parse SUMMARY.md content and extract link targets of entries shaped
    like ``[Title](some/path.md)``.

    Transformations applied to each extracted path:
      - paths ending in ``README.md`` become ``.../index.html``
      - any other ``.md`` suffix becomes ``.html``
      - '/index'-style paths are intentionally kept as-is

    Parameters:
        summary_text (str): raw markdown text of a SUMMARY.md file.

    Returns:
        list[str]: unique cleaned paths, in first-seen order.
    """
    pattern = r"\[[^\]]+\]\(\s*([^)]+?)\s*\)"
    matches = re.findall(pattern, summary_text)

    cleaned_paths = []
    seen = set()  # O(1) membership test instead of scanning the list each time
    for path in matches:
        path = path.strip()
        # 1) README.md pages become the directory's index page
        if path.endswith("README.md"):
            path = path[:-len("README.md")] + "index.html"
        # 2) Any other markdown source maps to its rendered .html page
        elif path.endswith(".md"):
            path = path[:-len(".md")] + ".html"
        # Keep only the first occurrence, preserving document order
        if path not in seen:
            seen.add(path)
            cleaned_paths.append(path)
    return cleaned_paths
def compute_priority_from_depth(path):
    """
    Map a path's folder depth to a sitemap <priority> value.

    Depth 0 yields 1.0; each additional subfolder subtracts 0.1,
    with a floor of 0.5.
    """
    trimmed = path.strip('/')
    depth = trimmed.count('/') if trimmed else 0
    return max(1.0 - 0.1 * depth, 0.5)
def prettify_xml(element):
    """Serialize *element* as indented XML text with an XML declaration."""
    raw = ET.tostring(element, encoding='utf-8')
    dom = minidom.parseString(raw)
    return dom.toprettyxml(indent=" ", encoding="UTF-8").decode('UTF-8')
def add_translated_urls(url_element, base_domain, path):
    """
    Append <xhtml:link rel="alternate"> entries for every translation of
    *path* under *base_domain*.

    Emits:
      - an ``x-default`` link pointing at the English version
      - an explicit ``en`` link — the alternate set should include the
        page's own language (previously missing; x-default alone does
        not declare the English pages as an alternate)
      - one link per code in the module-level ``languages`` mapping

    Parameters:
        url_element: the <url> element to append links to.
        base_domain (str): e.g. "book.hacktricks.wiki".
        path (str): site-relative, unencoded page path.
    """
    # Encode the path for safety (handles chars like '+', spaces, ...)
    encoded_path = urllib.parse.quote(path, safe="/:?=&%")

    def _add_link(hreflang, lang_prefix):
        # One <xhtml:link rel="alternate"> per language variant.
        el = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link')
        el.set('rel', 'alternate')
        el.set('hreflang', hreflang)
        el.set('href', f"https://{base_domain}/{lang_prefix}/{encoded_path}")

    # x-default falls back to the English version
    _add_link('x-default', 'en')
    # English itself must appear among the alternates
    _add_link('en', 'en')
    for lang_code in languages.values():
        _add_link(lang_code, lang_code)
# --------------------------------------------------------------------
# 3) Main logic
# --------------------------------------------------------------------
def _append_url_entries(root, domain, paths, desc):
    """Append one <url> entry (loc, priority, alternates) per path to *root*.

    Shared by the Book and Cloud passes — the two loops in the original
    main() were copy-paste duplicates differing only in domain and label.
    """
    for p in tqdm(paths, desc=desc):
        url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
        # Encode path to handle special chars like '+'
        encoded_path = urllib.parse.quote(p, safe="/:?=&%")
        # Canonical location: https://<domain>/en/<path>
        loc_el = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
        loc_el.text = f"https://{domain}/en/{encoded_path}"
        # Priority derived from folder depth (1.0 at root, floor 0.5)
        priority_el = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}priority')
        priority_el.text = f"{compute_priority_from_depth(p):.2f}"
        # Per-language alternate links
        add_translated_urls(url_element, domain, p)
        root.append(url_element)

def main():
    """Fetch both SUMMARY files, build the combined sitemap, write sitemap.xml."""
    print("**Fetching SUMMARY files**...")
    book_summary = fetch_summary(SUMMARY_URL_BOOK)
    cloud_summary = fetch_summary(SUMMARY_URL_CLOUD)

    print("**Extracting paths from summaries**...")
    book_paths = parse_paths_from_summary(book_summary)
    cloud_paths = parse_paths_from_summary(cloud_summary)

    # Register namespaces so output uses the default sitemap ns + xhtml prefix
    ET.register_namespace('', "http://www.sitemaps.org/schemas/sitemap/0.9")
    ET.register_namespace('xhtml', "http://www.w3.org/1999/xhtml")
    root = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset')

    print("**Processing Book paths**...")
    _append_url_entries(root, BOOK_DOMAIN, book_paths, "Book paths")

    print("**Processing Cloud paths**...")
    _append_url_entries(root, CLOUD_DOMAIN, cloud_paths, "Cloud paths")

    print("**Generating final sitemap**...")
    with open("sitemap.xml", "w", encoding="utf-8") as f:
        f.write(prettify_xml(root))
    print("**sitemap.xml has been successfully generated.**")
if __name__ == "__main__":
    # Entry point: run the generator only when executed as a script.
    main()