forked from SciVault/sciuploader
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path0_generate_basic_metadata.js
More file actions
100 lines (89 loc) · 3.29 KB
/
0_generate_basic_metadata.js
File metadata and controls
100 lines (89 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
const fs = require('fs').promises;
const path = require('path');
/**
 * Lists the JSON files that sit directly inside a directory.
 *
 * Only the directory's own entries are inspected (no recursion into
 * sub-directories); an entry qualifies when its name ends with ".json".
 *
 * @param {string} dir - Directory to scan.
 * @returns {Promise<string[]>} Paths formed by joining `dir` with each matching entry name.
 * @throws Rethrows any error raised while listing the directory, after logging it.
 */
async function walkDir(dir) {
  try {
    const jsonPaths = [];
    for (const entryName of await fs.readdir(dir)) {
      if (entryName.endsWith('.json')) {
        jsonPaths.push(path.join(dir, entryName));
      }
    }
    return jsonPaths;
  } catch (readError) {
    // Log for visibility, then let the caller decide how to recover.
    console.error('Error reading directory:', readError);
    throw readError;
  }
}
/**
 * Rebuilds a paper's abstract from its OpenAlex inverted index.
 *
 * The index is read from `paper.openalex.abstract_inverted_index` and is
 * treated as a map from each word to the list of positions at which it occurs.
 * Words are placed at their positions in a single pass and then emitted in
 * ascending position order, separated by single spaces.
 *
 * Edge cases:
 * - A missing paper, missing index, or empty index yields "".
 * - If two words claim the same position, the first one in `Object.entries`
 *   order wins.
 * - Positions that are not non-negative integers, and words whose position
 *   list is not an array, are ignored.
 * - Gaps in the position sequence are skipped, so they do not produce
 *   doubled spaces in the result.
 *
 * @param {object|null|undefined} paper - Paper record, possibly carrying `openalex` data.
 * @returns {string} The reconstructed abstract, or "" when none is available.
 */
function extractAbstract(paper) {
  const invertedIndex = paper?.openalex?.abstract_inverted_index;
  if (!invertedIndex || typeof invertedIndex !== 'object') {
    return ""; // No usable abstract data
  }

  // position -> word. A Map keeps the cost proportional to the number of
  // positions, regardless of how large the highest position is.
  const wordAtPosition = new Map();
  for (const [word, positions] of Object.entries(invertedIndex)) {
    if (!Array.isArray(positions)) {
      continue;
    }
    for (const position of positions) {
      const isValidPosition = Number.isInteger(position) && position >= 0;
      if (isValidPosition && !wordAtPosition.has(position)) {
        wordAtPosition.set(position, word);
      }
    }
  }

  return [...wordAtPosition.keys()]
    .sort((left, right) => left - right)
    .map((position) => wordAtPosition.get(position))
    .join(' ');
}
/**
 * Formats one Crossref author entry as a display name.
 *
 * Uses "given family" when available, tolerating either part being absent,
 * and falls back to `name` (used for authors without personal name parts).
 *
 * @param {object} author - Crossref author entry.
 * @returns {string} Display name, or "" when the entry carries no name.
 */
function formatCrossrefAuthor(author) {
  const personalName = [author?.given, author?.family].filter(Boolean).join(" ");
  return personalName || author?.name || "";
}

/**
 * Reduces a combined paper record to the handful of fields used downstream.
 *
 * OpenAlex data is preferred; Crossref data is the fallback; "" is used when
 * neither source provides a value.
 *
 * @param {object} paper - Paper record with optional `openalex`, `crossref` and `doi` fields.
 * @returns {{abstract: string, title: string, authors: string, doi: string, aid: string}}
 *   - `abstract`: result of `extractAbstract(paper)`
 *   - `title`: OpenAlex title, else the first Crossref title
 *   - `authors`: comma-separated author names
 *   - `doi`: `paper.doi`
 *   - `aid`: last path segment of the OpenAlex id, else the Crossref DOI
 *     stripped of non-alphanumeric characters
 * @throws {TypeError} If `paper` is null or undefined.
 */
function extractBasicMetadata(paper) {
  // Skip authorships without a usable name so the list has no empty slots.
  const openalexAuthors = paper.openalex?.authorships
    ?.map((authorship) => authorship?.raw_author_name || authorship?.author?.display_name)
    .filter(Boolean)
    .join(", ");

  // Missing name parts must not be rendered as the literal text "undefined".
  const crossrefAuthors = paper.crossref?.author
    ?.map(formatCrossrefAuthor)
    .filter(Boolean)
    .join(", ");

  return {
    abstract: extractAbstract(paper),
    title: paper.openalex?.title ||
      paper.crossref?.title?.[0] ||
      "",
    authors: openalexAuthors || crossrefAuthors || "",
    doi: paper.doi || "",
    aid: paper.openalex?.id?.split("/").pop() ||
      paper.crossref?.DOI?.replace(/[^a-zA-Z0-9]/g, "") ||
      "",
  };
}
/**
 * Builds `basic_metadata.json` from the JSON paper records in a directory.
 *
 * Each path returned by `walkDir(metadataDir)` is read as UTF-8, parsed as
 * JSON and passed through `extractBasicMetadata`. A file that fails at any of
 * those steps is logged and skipped; the remaining files are still processed.
 * The collected records are written, pretty-printed, to `basic_metadata.json`
 * in the current working directory.
 *
 * @param {string} metadataDir - Directory holding the per-paper JSON files.
 * @returns {Promise<object[]>} The records that were successfully extracted.
 * @throws Rethrows (after logging) any error from listing the directory or
 *   writing the output file.
 */
async function generateBasicMetadata(metadataDir) {
  try {
    const jsonPaths = await walkDir(metadataDir);
    const records = [];

    for (const jsonPath of jsonPaths) {
      try {
        console.log(`Processing file: ${jsonPath}`);
        const rawText = await fs.readFile(jsonPath, 'utf8');
        // trim() strips a leading BOM and surrounding whitespace before parsing.
        const parsedPaper = JSON.parse(rawText.trim());
        records.push(extractBasicMetadata(parsedPaper));
      } catch (fileError) {
        // A bad file must not abort the run; report it and move on.
        console.error(`Error processing file ${jsonPath}:`, fileError);
      }
    }

    const outputPath = path.join(process.cwd(), 'basic_metadata.json');
    const serialized = JSON.stringify(records, null, 2);
    await fs.writeFile(outputPath, serialized);

    console.log(`Basic metadata generated and saved to ${outputPath}`);
    console.log(`Processed ${records.length} files successfully`);
    return records;
  } catch (fatalError) {
    console.error('Error generating basic metadata:', fatalError);
    throw fatalError;
  }
}
// Export the function so other scripts can require() this module.
module.exports = generateBasicMetadata;

// When executed directly, take the metadata directory from the first CLI
// argument, defaulting to ./metadata under the current working directory.
if (require.main === module) {
  const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata');
  generateBasicMetadata(metadataDir).catch((error) => {
    console.error(error);
    // Report failure to the calling shell instead of exiting with status 0.
    process.exitCode = 1;
  });
}