fix: Do not use webdriver_manager

alexandrainst · Oct 1, 2024 · 170550f
1 parent 9b943a1
commit 170550f
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 48 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,7 +18,6 @@ datasets = "^2.14.6"
 nltk = "^3.8.1"
 beautifulsoup4 = "^4.12.2"
 selenium = "^4.15.0"
-webdriver-manager = "^4.0.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.2"

diff --git a/src/tts_text/borger_dk.py b/src/tts_text/borger_dk.py
@@ -5,8 +5,6 @@
 from bs4 import Tag
 from omegaconf import DictConfig
 from tqdm.auto import tqdm
-from webdriver_manager.chrome import ChromeDriverManager
-import logging
 import re
 from .utils import extract_sentences, get_soup
 
@@ -35,10 +33,6 @@ def build_borger_dk_dataset(cfg: DictConfig) -> list[str]:
         with dataset_path.open("r", encoding="utf-8") as f:
             return f.read().split("\n")
 
-    # Install the Chrome driver, if it isn't already installed
-    logging.getLogger("WDM").setLevel(logging.WARNING)
-    ChromeDriverManager().install()
-
     # Get the overall categories from the front page
     soup = get_soup(url=BASE_URL, dynamic=True)
 

diff --git a/src/tts_text/sundhed_dk.py b/src/tts_text/sundhed_dk.py
@@ -6,8 +6,6 @@
 from bs4 import Tag
 from omegaconf import DictConfig
 from tqdm.auto import tqdm
-from webdriver_manager.chrome import ChromeDriverManager
-import logging
 import re
 from tqdm.contrib.concurrent import process_map
 import multiprocessing as mp
@@ -32,10 +30,6 @@ def build_sundhed_dk_dataset(cfg: DictConfig) -> list[str]:
         with dataset_path.open("r", encoding="utf-8") as f:
             return f.read().split("\n")
 
-    # Install the Chrome driver, if it isn't already installed
-    logging.getLogger("WDM").setLevel(logging.WARNING)
-    ChromeDriverManager().install()
-
     # Get the overall categories from the front page
     soup = get_soup(
         url=BASE_URL + "/borger/patienthaandbogen/",

diff --git a/src/tts_text/utils.py b/src/tts_text/utils.py
@@ -12,7 +12,6 @@
 import re
 import requests as rq
 from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import TimeoutException, WebDriverException
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
@@ -168,8 +167,24 @@ def get_soup(
 
     html: str = ""
     if dynamic:
-        options = Options()
-        options.add_argument("--headless")
+        options = webdriver.ChromeOptions()
+        chrome_arguments = [
+            "--no-sandbox",
+            "--remote-debugging-port=9222",
+            "--remote-debugging-pipe",
+            "--autoplay-policy=no-user-gesture-required",
+            "--no-first-run",
+            "--disable-gpu",
+            "--use-fake-ui-for-media-stream",
+            "--use-fake-device-for-media-stream",
+            "--disable-sync",
+            "--disable-dev-shm-usage",
+            "--start-maximized",
+            "--headless=new",
+        ]
+        for argument in chrome_arguments:
+            options.add_argument(argument=argument)
+
         driver = webdriver.Chrome(options=options)
         retries_left = 5
         while retries_left > 0 and not html: