-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
68 lines (52 loc) · 1.93 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import argparse
import logging
import re

import amcat4py
import cssselect
import requests
from amcat4py import AmcatClient
from lxml import html
# HTTP request headers: a User-Agent string imitating desktop Chrome,
# passed to requests.get by the scraping functions in this file.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/111.0.0.0 Safari/537.36'
    ),
}
def get_links(url, timeout=30):
    """Yield link dicts from a listing page and every 'load more' page after it.

    Each page is fetched, every anchor matching ``div.category- a`` is
    yielded as ``{"url": href}``, and the key embedded in the first
    ``button[onclick]`` handler is used to build the URL of the next page.

    Args:
        url: URL of the first listing page to fetch.
        timeout: Seconds to wait for each HTTP request before giving up.

    Yields:
        dict: ``{"url": href}`` for every anchor that has an ``href``.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        requests.Timeout: If a page does not respond within *timeout*.
    """
    # Compiled once; extracts the key for the next batch from the button's
    # onclick handler.
    next_key_re = re.compile(r"loadMore\.GetHtml\(this, 'ArticleRow', '(.+?)'\);")
    visited = set()
    # Stop if a page points back at one already fetched, so a repeated key
    # cannot make the crawl loop forever.
    while url not in visited:
        visited.add(url)
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        if not r.text.strip():
            # html.fromstring raises on an empty document; an empty body
            # means there is nothing further to list.
            return
        tree = html.fromstring(r.text)
        for anchor in tree.cssselect("div.category- a"):
            ref = anchor.get("href")
            if ref is None:
                # Anchor without a target: nothing to follow.
                continue
            yield dict(url=ref)
        buttons = tree.cssselect('button[onclick]')
        if not buttons:
            # No 'load more' button: this was the last page.
            return
        found = next_key_re.search(buttons[0].get('onclick'))
        if found is None:
            # The button is not a 'load more' control, so pagination ends.
            return
        url = f"https://www.rodi.nl/denhaag/components/html?component=ArticleRow&key={found.group(1)}"
def get_article(art, timeout=30):
    """Fetch one article page and yield it as a document dict.

    *art* is appended to ``https://www.rodi.nl``. Only URLs under
    ``https://www.rodi.nl/denhaag/nieuws/`` are fetched; anything else
    yields nothing.

    Args:
        art: Site-relative link (e.g. as produced by ``get_links``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Yields:
        dict: At most one dict with keys ``url``, ``title``, ``date``,
        ``text`` and ``publisher``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the page does not respond within *timeout*.
    """
    url = f"https://www.rodi.nl{art}"
    if not url.startswith("https://www.rodi.nl/denhaag/nieuws/"):
        return
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    tree = html.fromstring(r.text)
    headings = tree.cssselect("article h1")
    timestamps = tree.cssselect('time')
    if not headings or not timestamps:
        # A page under /nieuws/ without a heading or <time> element cannot
        # be turned into a document; skip it instead of aborting the whole
        # crawl with an IndexError.
        logging.getLogger(__name__).warning(
            "Skipping %s: no 'article h1' or 'time' element found", url
        )
        return
    paragraphs = tree.cssselect("article p")
    yield dict(
        url=url,
        title=headings[0].text_content(),
        date=timestamps[0].get('datetime'),
        # Paragraph texts are stripped and joined with blank lines.
        text="\n\n".join(p.text_content().strip() for p in paragraphs),
        publisher="Rodi Den Haag",
    )
# Command-line interface: the AmCAT server to connect to and the index
# that receives the scraped documents.
parser = argparse.ArgumentParser()
parser.add_argument("server", help="AmCAT host name")
parser.add_argument("index", help="AmCAT index")
args = parser.parse_args()

conn = AmcatClient(args.server)

# Walk every link found on the listing pages and upload each article as
# soon as it has been scraped, one document per upload call.
for link in get_links("https://www.rodi.nl/denhaag"):
    for article in get_article(link['url']):
        conn.upload_documents(args.index, [article])