-
Notifications
You must be signed in to change notification settings - Fork 0
/
genTestData.mjs
115 lines (104 loc) · 2.37 KB
/
genTestData.mjs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import * as tldr from 'wikipedia-tldr'
const { default: getArticle } = tldr
import { readFileSync, writeFileSync } from 'fs'
/**
 * Announces the request on stdout, pauses, and then asks `getArticle` for the
 * named article.
 *
 * @param {string} articleName - Name handed unchanged to `getArticle`.
 * @param {number} delay - Milliseconds to wait before issuing the request.
 * @returns {Promise<*>} Whatever `getArticle` resolves with for `articleName`.
 */
async function delayedGetArticle(articleName, delay) {
  const message = 'Fetching article: ' + articleName
  console.log(message)

  // The pause happens before the request, so callers looping over many names
  // space their requests out by `delay`.
  const pause = new Promise((done) => {
    setTimeout(done, delay)
  })
  await pause

  const article = await getArticle(articleName)
  return article
}
/**
 * Picks a uniformly random position within `arr`.
 *
 * @param {Array} arr - Array whose length bounds the result.
 * @returns {number} An integer from 0 up to (but excluding) `arr.length`;
 *   0 when `arr` is empty, which is then not a valid index.
 */
function getRandomIndex(arr) {
  const roll = Math.random()
  const scaled = roll * arr.length
  return Math.floor(scaled)
}
// Article names that `run` feeds to `delayedGetArticle`, one request per entry.
// `run` starts at a random position in this list and walks to the end, so
// entries near the bottom are requested more often than entries near the top.
// NOTE(review): presumably these are Wikipedia article titles/slugs accepted by
// `wikipedia-tldr` — confirm against that package's documentation.
const articleNames = [
'neuroscience',
'artificial_intelligence',
'genetics',
'biochemistry',
'nanotechnology',
'ecology',
'climate_change',
'cognitive_science',
'epistemology',
'ethics',
'linguistics',
'semiotics',
'statistics',
'calculus',
'geometry',
'cybernetics',
'telecommunication',
'robotics',
'astronomy',
'cosmology',
'information',
'data',
'computer',
'technology',
'somatology',
'sociology',
'psychology',
'philosophy',
'biology',
'anatomy',
'physiology',
'medicine',
'health',
'disease',
'death',
'life',
'human',
'animal',
'plant',
'cell',
'molecule',
'atom',
'particle',
'quantum',
'physics',
'chemistry',
'mathematics',
'logic',
'language',
'communication',
]
/**
 * Reads the previously stored records from `path`.
 *
 * @param {string} path - Location of the JSON store.
 * @returns {Array<object>} Parsed records, or an empty array when the file
 *   does not exist yet (first run).
 * @throws {SyntaxError} When the file exists but is not valid JSON.
 * @throws {TypeError} When the file parses to something other than an array.
 */
function readStoredArticles(path) {
  let raw
  try {
    raw = readFileSync(path, 'utf8')
  } catch (err) {
    // Only "file not found" means "nothing stored yet"; permission problems
    // and other I/O failures must still surface.
    if (err.code === 'ENOENT') return []
    throw err
  }
  const parsed = JSON.parse(raw)
  if (!Array.isArray(parsed)) {
    throw new TypeError(`${path} must contain a JSON array`)
  }
  return parsed
}

/**
 * Converts a fetched article into the record shape written to the store.
 *
 * @param {object} article - Truthy value resolved by `delayedGetArticle`.
 * @returns {{term: *, text: *, metadata: {id: *, title: *, lang: *}}}
 */
function toStoredRecord(article) {
  return {
    term: article.query,
    text: article.extract,
    metadata: {
      id: article.wikibase_item,
      title: article.title,
      lang: article.lang,
    },
  }
}

/**
 * Keeps the first record seen for each `term`, preserving input order.
 *
 * @param {Array<{term: *}>} arr - Records to de-duplicate.
 * @returns {Array<{term: *}>} New array without repeated terms.
 */
function removeDuplicatesByTerm(arr) {
  const seenTerms = new Set()
  return arr.filter(({ term }) => {
    if (seenTerms.has(term)) return false
    seenTerms.add(term)
    return true
  })
}

/**
 * Fetches a random tail of `articleNames` and merges the results into
 * `wikipediaArticles.json`, rewriting the file without duplicate terms.
 *
 * One pass: start at a random index of `articleNames`, fetch every name from
 * there to the end (500 ms pause before each request), drop falsy results,
 * append the rest to the stored records, de-duplicate by `term`, and write
 * the file back pretty-printed.
 *
 * Another pass is started only when the merge contained duplicate terms AND
 * the pass added at least one term the store did not have before. Requiring
 * progress is what guarantees termination: every pass runs to the end of the
 * list, so once the store overlaps with it a duplicates-only condition is
 * true on every pass and the script would request the remote API forever.
 *
 * @returns {Promise<void>} Resolves once a pass ends without needing another.
 * @throws Propagates errors from `delayedGetArticle`, from reading or parsing
 *   the store (a missing file is treated as an empty store), and from
 *   writing it.
 */
const run = async () => {
  const storePath = 'wikipediaArticles.json'

  let needsAnotherPass = true
  while (needsAnotherPass) {
    const fetched = []
    for (const name of articleNames.slice(getRandomIndex(articleNames))) {
      // Guard against blank entries in the list: stop the pass at the first one.
      if (!name) break
      fetched.push(await delayedGetArticle(name, 500))
    }

    // Read the store only after fetching so the merge sees its latest content.
    const stored = readStoredArticles(storePath)
    const termsBefore = removeDuplicatesByTerm(stored).length

    const fresh = fetched.filter(Boolean).map(toStoredRecord)
    const merged = stored.concat(fresh)
    const uniqueArticles = removeDuplicatesByTerm(merged)

    writeFileSync(storePath, JSON.stringify(uniqueArticles, null, 2))

    const foundDuplicates = uniqueArticles.length !== merged.length
    const addedNewTerms = uniqueArticles.length > termsBefore
    needsAnotherPass = foundDuplicates && addedNewTerms
  }
}
// Script entry point: start generating data as soon as the module is evaluated.
// Because this is a top-level await, a rejection from `run` fails the module.
await run()