Skip to content

Commit 679c127

Browse files
committed
Fix hierarchy_radio error when scraping
1 parent f07889e commit 679c127

File tree

4 files changed

+23
-9
lines changed

4 files changed

+23
-9
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,3 +20,4 @@ geckodriver.log
2020

2121
configs/private
2222
/typesense-server-data/
23+
typesense-data/

README.md

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,13 @@ Remember to change the version numbers in the URL as needed.
3737

3838
This section only applies if you're making changes to this scraper itself. If you only need to run the scraper, see Usage instructions above.
3939

40+
#### Running the code locally
41+
42+
```shellsession
43+
$ pipenv shell
44+
$ ./docsearch run configs/public/typesense_docs.json
45+
```
46+
4047
#### Releasing a new version
4148

4249
Basic/abbreviated instructions:

configs/public/typesense_docs.json

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
"url": "https://typesense.org/docs/(?P<version>.*?)/",
66
"variables": {
77
"version": [
8-
"0.21.0"
8+
"27.1"
99
]
1010
}
1111
}
@@ -22,9 +22,13 @@
2222
},
2323
"scrape_start_urls": false,
2424
"strip_chars": " .,;:#",
25-
"nb_hits": 505,
2625
"custom_settings": {
27-
"token_separators": ["_"],
28-
"symbols_to_index": ["*"]
29-
}
26+
"token_separators": [
27+
"_"
28+
],
29+
"symbols_to_index": [
30+
"*"
31+
]
32+
},
33+
"nb_hits": 16502
3034
}

scraper/src/typesense_helper.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -209,10 +209,12 @@ def transform_record(record):
209209

210210
# Flatten nested hierarchy fields
211211
for x in range(0, 7):
212-
if record['hierarchy'][f'lvl{x}'] is not None:
213-
transformed_record[f'hierarchy.lvl{x}'] = record['hierarchy'][f'lvl{x}']
214-
if record['hierarchy_radio'][f'lvl{x}'] is not None:
215-
transformed_record[f'hierarchy_radio.lvl{x}'] = record['hierarchy_radio'][f'lvl{x}']
212+
if 'hierarchy' in record and f'lvl{x}' in record['hierarchy']:
213+
if record['hierarchy'][f'lvl{x}'] is not None:
214+
transformed_record[f'hierarchy.lvl{x}'] = record['hierarchy'][f'lvl{x}']
215+
if 'hierarchy_radio' in record and f'lvl{x}' in record['hierarchy_radio']:
216+
if record['hierarchy_radio'][f'lvl{x}'] is not None:
217+
transformed_record[f'hierarchy_radio.lvl{x}'] = record['hierarchy_radio'][f'lvl{x}']
216218

217219
# Convert version to array
218220
if 'version' in record and type(record['version']) == str:

0 commit comments

Comments
 (0)