Skip to content

Commit

Permalink
fix: Do not use webdriver_manager
Browse files (browse the repository at this point in the history)
  • Loading branch information
saattrupdan committed Oct 1, 2024
1 parent 9b943a1 commit 170550f
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 48 deletions.
35 changes: 3 additions & 32 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ datasets = "^2.14.6"
nltk = "^3.8.1"
beautifulsoup4 = "^4.12.2"
selenium = "^4.15.0"
webdriver-manager = "^4.0.1"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.2"
Expand Down
6 changes: 0 additions & 6 deletions src/tts_text/borger_dk.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
from bs4 import Tag
from omegaconf import DictConfig
from tqdm.auto import tqdm
from webdriver_manager.chrome import ChromeDriverManager
import logging
import re
from .utils import extract_sentences, get_soup

Expand Down Expand Up @@ -35,10 +33,6 @@ def build_borger_dk_dataset(cfg: DictConfig) -> list[str]:
with dataset_path.open("r", encoding="utf-8") as f:
return f.read().split("\n")

# Install the Chrome driver, if it isn't already installed
logging.getLogger("WDM").setLevel(logging.WARNING)
ChromeDriverManager().install()

# Get the overall categories from the front page
soup = get_soup(url=BASE_URL, dynamic=True)

Expand Down
6 changes: 0 additions & 6 deletions src/tts_text/sundhed_dk.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
from bs4 import Tag
from omegaconf import DictConfig
from tqdm.auto import tqdm
from webdriver_manager.chrome import ChromeDriverManager
import logging
import re
from tqdm.contrib.concurrent import process_map
import multiprocessing as mp
Expand All @@ -32,10 +30,6 @@ def build_sundhed_dk_dataset(cfg: DictConfig) -> list[str]:
with dataset_path.open("r", encoding="utf-8") as f:
return f.read().split("\n")

# Install the Chrome driver, if it isn't already installed
logging.getLogger("WDM").setLevel(logging.WARNING)
ChromeDriverManager().install()

# Get the overall categories from the front page
soup = get_soup(
url=BASE_URL + "/borger/patienthaandbogen/",
Expand Down
21 changes: 18 additions & 3 deletions src/tts_text/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import re
import requests as rq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
Expand Down Expand Up @@ -168,8 +167,24 @@ def get_soup(

html: str = ""
if dynamic:
options = Options()
options.add_argument("--headless")
options = webdriver.ChromeOptions()
chrome_arguments = [
"--no-sandbox",
"--remote-debugging-port=9222",
"--remote-debugging-pipe",
"--autoplay-policy=no-user-gesture-required",
"--no-first-run",
"--disable-gpu",
"--use-fake-ui-for-media-stream",
"--use-fake-device-for-media-stream",
"--disable-sync",
"--disable-dev-shm-usage",
"--start-maximized",
"--headless=new",
]
for argument in chrome_arguments:
options.add_argument(argument=argument)

driver = webdriver.Chrome(options=options)
retries_left = 5
while retries_left > 0 and not html:
Expand Down

0 comments on commit 170550f

Please sign in to comment.