From 1e5c6488ff792456c9abd17f048e30d14cc5ae5b Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 5 Dec 2023 12:48:14 +0800 Subject: [PATCH 01/37] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 5f1bdf901..c90394a30 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 021a83f3e..a6eff4063 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 4109c49fc..f9a2f50c8 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.10.0 + 0.10.1-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index b47ae3614..e68385967 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 08e70c161..7361216f2 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 4a2b358d0..bff1de3f6 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 92914655a..c81c5613b 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ 
-3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 5c2e50b2a..8381c0275 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 From 7ededbea1a3b040c4429293e10a30996ccf9caf0 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 5 Dec 2023 12:56:14 +0800 Subject: [PATCH 02/37] Fix javadoc. --- webmagic-core/src/main/java/us/codecraft/webmagic/Page.java | 1 + 1 file changed, 1 insertion(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 17f8b03dd..b4c161a9a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -71,6 +71,7 @@ public static Page fail() { * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, * and {@link #request} is specified. * + * @param request the {@link Request}. * @return the page. * @since 0.10.0 */ From 95d1f4415039942d8c6799d172d710afb2102dd2 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 3 Mar 2024 18:33:00 +0800 Subject: [PATCH 03/37] Optimize Request#extras, fix #1148. 
--- .../java/us/codecraft/webmagic/Request.java | 18 +++++------ .../us/codecraft/webmagic/RequestTest.java | 32 +++++++++++++++++-- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9fc286192..a59b20637 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,13 +1,14 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.downloader.Downloader; -import us.codecraft.webmagic.model.HttpRequestBody; -import us.codecraft.webmagic.utils.Experimental; - import java.io.Serializable; +import java.util.Collections; import java.util.HashMap; import java.util.Map; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.model.HttpRequestBody; +import us.codecraft.webmagic.utils.Experimental; + /** * Object contains url to crawl.
* It contains some additional information.
@@ -35,7 +36,7 @@ public class Request implements Serializable { /** * Store additional information in extras. */ - private Map extras; + private Map extras = new HashMap<>(); /** * cookies for current url, if not set use Site's cookies @@ -93,9 +94,6 @@ public T getExtra(String key) { } public Request putExtra(String key, T value) { - if (extras == null) { - extras = new HashMap(); - } extras.put(key, value); return this; } @@ -105,11 +103,11 @@ public String getUrl() { } public Map getExtras() { - return extras; + return Collections.unmodifiableMap(extras); } public Request setExtras(Map extras) { - this.extras = extras; + this.extras.putAll(extras); return this; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java index c7e4943d9..b8f699a6f 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java @@ -1,9 +1,13 @@ package us.codecraft.webmagic; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Map; + import org.junit.Test; -import us.codecraft.webmagic.utils.HttpConstant; -import static org.assertj.core.api.Assertions.assertThat; +import us.codecraft.webmagic.utils.HttpConstant; /** * @author code4crafter@gmail.com @@ -22,4 +26,28 @@ public void testEqualsAndHashCode() throws Exception { assertThat(requestA).isNotEqualTo(requestB); assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode()); } + + @Test + public void testSetExtras() { + Request request = new Request(); + Map extras = Collections.singletonMap("a", "1"); + request.setExtras(extras); + request.putExtra("b", "2"); + assertThat(request.getExtra("a")).isEqualTo("1"); + assertThat(request.getExtra("b")).isEqualTo("2"); + } + + @Test + public void testGetExtras() { + Request request = new Request(); + request.putExtra("a", "1"); + 
assertThat(request.getExtras()).containsEntry("a", "1"); + } + + @Test(expected = UnsupportedOperationException.class) + public void testGetExtrasShouldBeUnmodifiable() { + Request request = new Request(); + request.getExtras().put("a", "1"); + } + } From e4ab6e27e4fd127d1feeea862a7ff10eb91c2ae7 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 3 Mar 2024 18:35:25 +0800 Subject: [PATCH 04/37] Optimize Request#extras, refs #1148. --- .../us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java | 2 +- .../java/us/codecraft/webmagic/scheduler/RedisScheduler.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java index 46d47e5a5..7abe5bfad 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java @@ -102,7 +102,7 @@ private String getZsetMinusPriorityKey(Task task) { } private void setExtrasInItem(Jedis jedis,Request request, Task task) { - if (request.getExtras() != null) { + if (!request.getExtras().isEmpty()) { String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset(getItemKey(task), field, value); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 19e831321..8d61bea3b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -84,7 +84,7 @@ private boolean checkForAdditionalInfo(Request request) { return true; } - if (request.getExtras() != null && !request.getExtras().isEmpty()) { + if 
(!request.getExtras().isEmpty()) { return true; } if (request.getPriority() != 0L) { From 22a60df6aa06d8c73642b2d4c9f839d74bbb7f0f Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 3 Mar 2024 20:02:09 +0800 Subject: [PATCH 05/37] Fix build for selenium upgrading from 3.141.59 to 4.14.1, refs #1134. --- .../downloader/selenium/WebDriverPool.java | 33 ++++++++++--------- .../webmagic/downloader/SeleniumTest.java | 13 ++++---- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index e1d9dd039..b96d2894b 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -1,15 +1,5 @@ package us.codecraft.webmagic.downloader.selenium; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.firefox.FirefoxDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriverService; -import org.openqa.selenium.remote.DesiredCapabilities; -import org.openqa.selenium.remote.RemoteWebDriver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.FileReader; import java.io.IOException; import java.net.MalformedURLException; @@ -22,6 +12,18 @@ import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.firefox.FirefoxOptions; +import org.openqa.selenium.phantomjs.PhantomJSDriver; +import org.openqa.selenium.phantomjs.PhantomJSDriverService; +import 
org.openqa.selenium.remote.DesiredCapabilities; +import org.openqa.selenium.remote.RemoteWebDriver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * @author code4crafter@gmail.com
* Date: 13-7-26
@@ -58,7 +60,7 @@ class WebDriverPool { * Configure the GhostDriver, and initialize a WebDriver instance. This part * of code comes from GhostDriver. * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver - * + * * @author bob.li.0718@gmail.com * @throws IOException */ @@ -73,7 +75,6 @@ public void configure() throws IOException { // Prepare capabilities sCaps = new DesiredCapabilities(); - sCaps.setJavascriptEnabled(true); sCaps.setCapability("takesScreenshot", false); String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); @@ -134,9 +135,9 @@ public void configure() throws IOException { sCaps.setBrowserName("phantomjs"); mDriver = new RemoteWebDriver(new URL(driver), sCaps); } else if (driver.equals(DRIVER_FIREFOX)) { - mDriver = new FirefoxDriver(sCaps); + mDriver = new FirefoxDriver(new FirefoxOptions(sCaps)); } else if (driver.equals(DRIVER_CHROME)) { - mDriver = new ChromeDriver(sCaps); + mDriver = new ChromeDriver(new ChromeOptions().merge(sCaps)); } else if (driver.equals(DRIVER_PHANTOMJS)) { mDriver = new PhantomJSDriver(sCaps); } @@ -144,7 +145,7 @@ public void configure() throws IOException { /** * check whether input is a valid URL - * + * * @author bob.li.0718@gmail.com * @param urlString urlString * @return true means yes, otherwise no. 
@@ -178,7 +179,7 @@ public WebDriverPool() { } /** - * + * * @return * @throws InterruptedException */ diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java index b7bcd80b3..43ac84b5a 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java @@ -1,17 +1,18 @@ package us.codecraft.webmagic.downloader; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + import org.junit.Ignore; import org.junit.Test; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.remote.DesiredCapabilities; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - /** * @author code4crafter@gmail.com
* Date: 13-7-26
@@ -29,10 +30,10 @@ public void testSelenium() { Map preferences = new HashMap(); preferences.put("profile.default_content_settings", contentSettings); - DesiredCapabilities caps = DesiredCapabilities.chrome(); + DesiredCapabilities caps = new DesiredCapabilities(); caps.setCapability("chrome.prefs", preferences); caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); - WebDriver webDriver = new ChromeDriver(caps); + WebDriver webDriver = new ChromeDriver(new ChromeOptions().merge(caps)); webDriver.get("http://huaban.com/"); WebElement webElement = webDriver.findElement(By.xpath("/html")); System.out.println(webElement.getAttribute("outerHTML")); From 7f8607b88130daf1814abf8a6792d6433d23bf57 Mon Sep 17 00:00:00 2001 From: Ch3n4y Date: Thu, 7 Mar 2024 08:41:26 +0800 Subject: [PATCH 06/37] update com.fasterxml.jackson.core:jackson-databind 2.15.2 to 2.16.0 (#1149) --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 7361216f2..ad7fae4ce 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -42,7 +42,7 @@ com.fasterxml.jackson.core jackson-databind - 2.15.2 + 2.16.0 From 80842d72db35b22d93e34d7773251c9bec9a9de9 Mon Sep 17 00:00:00 2001 From: ayushi250317 <157420261+ayushi250317@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:22:30 -0300 Subject: [PATCH 07/37] Added test cases for improving line and branch coverage (#1150) * Initial Commit * Assignment 1 Submission --- .../java/us/codecraft/webmagic/SiteTest.java | 23 ++++++++ .../downloader/HttpClientDownloaderTest.java | 9 +++ .../webmagic/selector/AndSelectorTest.java | 59 +++++++++++++++++++ .../webmagic/selector/CssSelectorTest.java | 39 ++++++++++++ .../webmagic/selector/OrSelectorTest.java | 44 ++++++++++++++ 5 files changed, 174 insertions(+) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java create 
mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java index 783b82ddc..47c4fcc14 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java @@ -1,8 +1,12 @@ package us.codecraft.webmagic; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import org.junit.Test; @@ -14,4 +18,23 @@ public void test() { assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); } + @Test + public void addCookieTest(){ + Site site=Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + site.addCookie("cookieDefault","cookie-webmagicDefault"); + String firstDomain="example.com"; + String secondDomain="exampleCopy.com"; + site.addCookie(firstDomain, "cookie", "cookie-webmagic"); + site.addCookie(firstDomain, "cookieCopy", "cookie-webmagicCopy"); + site.addCookie(secondDomain, "cookie", "cookie-webmagic"); + Map> allCookies = site.getAllCookies(); + List domains=new ArrayList<>(); + for(String key : allCookies.keySet()){ + domains.add(key); + } + assertEquals("cookie-webmagic", allCookies.get(firstDomain).get("cookie")); + assertEquals("cookie-webmagicCopy", allCookies.get(firstDomain).get("cookieCopy")); + assertEquals("cookie-webmagic", allCookies.get(secondDomain).get("cookie")); + assertEquals(2, domains.size()); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 780ca7529..1ff7b4dd7 100644 --- 
a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -40,6 +40,7 @@ import static com.github.dreamhead.moco.Moco.with; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; /** @@ -333,5 +334,13 @@ public void run() throws Exception { }); } + @Test + public void test_no_task_download(){ + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423/"); + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + assertThrows(NullPointerException.class, () -> httpClientDownloader.download(request,null)); + } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java new file mode 100644 index 000000000..59885ebd1 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic.selector; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +public class AndSelectorTest { + + @Test + public void testSelectList() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + List selectors = new ArrayList(); + selectors.add(new CssSelector("div")); + selectors.add(new XpathSelector("//div[@class='item1']")); + AndSelector andSelector = new AndSelector(selectors); + List result = andSelector.selectList(htmlContent); + assertEquals("
\n Item 1\n
", result.get(0)); + } + + @Test + public void testSelectList_NoResults() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + List selectors = new ArrayList(); + selectors.add(new CssSelector("div")); + selectors.add(new XpathSelector("//div[@class='item']")); + AndSelector andSelector = new AndSelector(selectors); + List result = andSelector.selectList(htmlContent); + assertEquals(0, result.size()); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java new file mode 100644 index 000000000..8b1ace903 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.runners.MockitoJUnitRunner; + +import java.util.List; +import static org.junit.Assert.*; + +public class CssSelectorTest { + + @Test + public void testSelectElement() { + CssSelector cssSelector = new CssSelector("div"); + String htmlContent = "Dummy Page
Hello World!
"; + Document doc = Jsoup.parse(htmlContent); + Element dummyElement = doc.getElementById("dummyDiv"); + Element resultElement = cssSelector.selectElement(dummyElement); + assertNotNull(resultElement); + } + + @Test + public void testSelectList() { + CssSelector cssSelector = new CssSelector("div"); + String htmlContent = "Dummy Page
Hello World!
"; + Document doc = Jsoup.parse(htmlContent); + Element dummyElement = doc.getElementById("dummyDiv"); + List result = cssSelector.selectList(dummyElement); + assertEquals(1, result.size()); + assertEquals("[
\n Hello World!\n
]", result.toString()); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java new file mode 100644 index 000000000..24d87647c --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.selector; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +public class OrSelectorTest { + @Test + public void testSelectList() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + String expectedResult = "[\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + ",
\n" + + " Item 1\n" + + "
,
\n" + + " Item 2\n" + + "
]"; + List selectors = new ArrayList(); + selectors.add(new CssSelector("head")); + selectors.add(new XpathSelector("//div[@class='item1']")); + selectors.add(new XpathSelector("//div[@class='item2']")); + OrSelector orSelector = new OrSelector(selectors); + List result = orSelector.selectList(htmlContent); + assertEquals(expectedResult, result.toString()); + } +} From 28ac8bf9c433b492fca5f241fa205674285ad87d Mon Sep 17 00:00:00 2001 From: ayushi250317 <157420261+ayushi250317@users.noreply.github.com> Date: Thu, 28 Mar 2024 13:45:12 -0300 Subject: [PATCH 08/37] Refactored Code to Resolve Implementation Code Smells (#1151) * Initial Commit * Assignment 1 Submission * Resolving Implementation Smells --- .../downloader/HttpUriRequestConverter.java | 4 +- .../webmagic/model/PageModelExtractor.java | 104 ++++++++++-------- 2 files changed, 60 insertions(+), 48 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 4baaf4a4a..168467866 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -42,7 +42,9 @@ private HttpClientContext convertHttpClientContext(Request request, Site site, P HttpClientContext httpContext = new HttpClientContext(); if (proxy != null && proxy.getUsername() != null) { AuthState authState = new AuthState(); - authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); + BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY); + UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()); + authState.update(proxyAuthScheme, proxyCredentials); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, 
authState); } if (request.getCookies() != null && !request.getCookies().isEmpty()) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 1e25a46c0..d8947ded6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -234,63 +234,23 @@ private Object processSingle(Page page, String html, boolean isRaw) { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { if (fieldExtractor.isMulti()) { - List value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().selectList(html); - } - break; - case Url: - value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().selectList(html); - } + List value=getMultiValueFromSource(page, fieldExtractor, html, isRaw); if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { return null; } if (fieldExtractor.getObjectFormatter() != null) { - List converted = convert(value, fieldExtractor.getObjectFormatter()); + List converted = convertMultiValue(value, fieldExtractor.getObjectFormatter()); setField(o, fieldExtractor, converted); } else { setField(o, fieldExtractor, value); } } else { - String value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = 
page.getHtml().selectDocument(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().select(html); - } - break; - case Url: - value = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().select(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().select(html); - } + String value=getSingleValueFromSource(page, fieldExtractor, html, isRaw); if (value == null && fieldExtractor.isNotNull()) { return null; } if (fieldExtractor.getObjectFormatter() != null) { - Object converted = convert(value, fieldExtractor.getObjectFormatter()); + Object converted = convertSingleValue(value, fieldExtractor.getObjectFormatter()); if (converted == null && fieldExtractor.isNotNull()) { return null; } @@ -313,7 +273,57 @@ private Object processSingle(Page page, String html, boolean isRaw) { return o; } - private Object convert(String value, ObjectFormatter objectFormatter) { + private List getMultiValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { + List value; + switch (fieldExtractor.getSource()) { + case RawHtml: + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + break; + case Html: + if (isRaw) { + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().selectList(html); + } + break; + case Url: + value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); + break; + case RawText: + value = fieldExtractor.getSelector().selectList(page.getRawText()); + break; + default: + value = fieldExtractor.getSelector().selectList(html); + } + return value; + } + + private String getSingleValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { + String value; + switch (fieldExtractor.getSource()) { + case RawHtml: + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); + break; + 
case Html: + if (isRaw) { + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().select(html); + } + break; + case Url: + value = fieldExtractor.getSelector().select(page.getUrl().toString()); + break; + case RawText: + value = fieldExtractor.getSelector().select(page.getRawText()); + break; + default: + value = fieldExtractor.getSelector().select(html); + } + return value; + } + + private Object convertSingleValue(String value, ObjectFormatter objectFormatter) { try { Object format = objectFormatter.format(value); logger.debug("String {} is converted to {}", value, format); @@ -324,10 +334,10 @@ private Object convert(String value, ObjectFormatter objectFormatter) { return null; } - private List convert(List values, ObjectFormatter objectFormatter) { + private List convertMultiValue(List values, ObjectFormatter objectFormatter) { List objects = new ArrayList(); for (String value : values) { - Object converted = convert(value, objectFormatter); + Object converted = convertSingleValue(value, objectFormatter); if (converted != null) { objects.add(converted); } From 9b9f173c1c356d2f2c9ca1c33339e459f37501c5 Mon Sep 17 00:00:00 2001 From: ayushi250317 <157420261+ayushi250317@users.noreply.github.com> Date: Sat, 30 Mar 2024 03:26:41 -0300 Subject: [PATCH 09/37] Refactored Code to increase maintainability (#1152) * Initial Commit * Assignment 1 Submission * Resolving Implementation Smells * Refactoring Code to increase maintainability --- .../java/us/codecraft/webmagic/Spider.java | 72 ++++------------ .../codecraft/webmagic/SpiderScheduler.java | 59 +++++++++++++ .../codecraft/webmagic/selector/HtmlNode.java | 1 - .../webmagic/selector/PlainText.java | 5 -- .../webmagic/selector/Selectable.java | 8 -- .../model/formatter/BasicClassDetector.java | 85 +++++++++++++++++++ .../model/formatter/BasicTypeFormatter.java | 31 +++---- 7 files changed, 174 insertions(+), 87 deletions(-) create mode 100644 
webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 9f9201ee3..11a671f7a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -9,11 +9,8 @@ import java.util.List; import java.util.UUID; import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; @@ -75,9 +72,9 @@ public class Spider implements Runnable, Task { protected Site site; protected String uuid; - - protected Scheduler scheduler = new QueueScheduler(); - + + protected SpiderScheduler scheduler; + protected Logger logger = LoggerFactory.getLogger(getClass()); protected CountableThreadPool threadPool; @@ -100,10 +97,6 @@ public class Spider implements Runnable, Task { protected boolean destroyWhenExit = true; - private ReentrantLock newUrlLock = new ReentrantLock(); - - private Condition newUrlCondition = newUrlLock.newCondition(); - private List spiderListeners; private final AtomicLong pageCount = new AtomicLong(0); @@ -131,6 +124,7 @@ public static Spider create(PageProcessor pageProcessor) { public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); + this.scheduler = new SpiderScheduler(new QueueScheduler()); } /** @@ -186,15 +180,15 @@ public Spider scheduler(Scheduler scheduler) { /** * set scheduler for Spider * - * @param scheduler 
scheduler + * @param updateScheduler scheduler * @return this * @see Scheduler * @since 0.2.1 */ - public Spider setScheduler(Scheduler scheduler) { + public Spider setScheduler(Scheduler updateScheduler) { checkIfRunning(); - Scheduler oldScheduler = this.scheduler; - this.scheduler = scheduler; + SpiderScheduler oldScheduler = this.scheduler; + scheduler.setScheduler(updateScheduler); if (oldScheduler != null) { Request request; while ((request = oldScheduler.poll(this)) != null) { @@ -213,7 +207,7 @@ public Spider setScheduler(Scheduler scheduler) { * @deprecated */ @Deprecated - public Spider pipeline(Pipeline pipeline) { + public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); } @@ -264,7 +258,7 @@ public Spider clearPipeline() { * @deprecated */ @Deprecated - public Spider downloader(Downloader downloader) { + public Spider downloader(Downloader downloader) { return setDownloader(downloader); } @@ -333,10 +327,10 @@ public void run() { } } else { // wait until new url added, - if (waitNewUrl()) { - //if interrupted + if (scheduler.waitNewUrl(threadPool, emptySleepTime)) { + // if interrupted break; - } + } continue; } } @@ -353,7 +347,7 @@ public void run() { logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); - signalNewUrl(); + scheduler.signalNewUrl(); } } }); @@ -536,7 +530,7 @@ public Spider addUrl(String... urls) { for (String url : urls) { addRequest(new Request(url)); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } @@ -588,42 +582,10 @@ public Spider addRequest(Request... 
requests) { for (Request request : requests) { addRequest(request); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } - /** - * - * @return isInterrupted - */ - private boolean waitNewUrl() { - // now there may not be any thread live - newUrlLock.lock(); - try { - //double check,unnecessary, unless very fast concurrent - if (threadPool.getThreadAlive() == 0) { - return false; - } - //wait for amount of time - newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); - return false; - } catch (InterruptedException e) { - // logger.warn("waitNewUrl - interrupted, error {}", e); - return true; - } finally { - newUrlLock.unlock(); - } - } - - private void signalNewUrl() { - try { - newUrlLock.lock(); - newUrlCondition.signalAll(); - } finally { - newUrlLock.unlock(); - } - } - public void start() { runAsync(); } @@ -799,7 +761,7 @@ public Date getStartTime() { } public Scheduler getScheduler() { - return scheduler; + return scheduler.getScheduler(); } /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java new file mode 100644 index 000000000..1005bac88 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; + +import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.thread.CountableThreadPool; + +public class SpiderScheduler { + private Scheduler scheduler; + private final ReentrantLock newUrlLock = new ReentrantLock(); + private final Condition newUrlCondition = newUrlLock.newCondition(); + + public SpiderScheduler(Scheduler scheduler) { + this.scheduler = scheduler; + } + + public Scheduler getScheduler() { + return scheduler; + } + + public void setScheduler(Scheduler scheduler) { + this.scheduler = 
scheduler; + } + + public Request poll(Spider spider) { + return scheduler.poll(spider); + } + + public void push(Request request, Spider spider) { + scheduler.push(request, spider); + } + + public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) { + newUrlLock.lock(); + try { + if (threadPool.getThreadAlive() == 0) { + return false; + } + newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); + return false; + } catch (InterruptedException e) { + return true; + } finally { + newUrlLock.unlock(); + } + } + + public void signalNewUrl() { + try { + newUrlLock.lock(); + newUrlCondition.signalAll(); + } finally { + newUrlLock.unlock(); + } + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index c063b4825..85ff5fa69 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -26,7 +26,6 @@ protected List getElements() { return elements; } - @Override public Selectable smartContent() { SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, getSourceTexts()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index c78f6791b..18258e9a7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -42,11 +42,6 @@ public Selectable xpath(String xpath) { throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); } - @Override - public Selectable smartContent() { - throw new UnsupportedOperationException("Smart content can not apply to plain text. 
Please check whether you use a previous xpath with attribute select (/@href etc)."); - } - @Override public Selectable links() { throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 9412cfce4..a4d5fdb94 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -51,14 +51,6 @@ public interface Selectable { * @return new Selectable after extract */ public Selectable css(String selector, String attrName); - - /** - * select smart content with ReadAbility algorithm - * - * @return content - */ - public Selectable smartContent(); - /** * select all links * diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java new file mode 100644 index 000000000..f03b8864a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java @@ -0,0 +1,85 @@ +package us.codecraft.webmagic.model.formatter; + +public interface BasicClassDetector { + Class detectBasicClass(Class type); +} + +class IntegerClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { + return Integer.class; + } + return null; + } +} + +class LongClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Long.TYPE) || type.equals(Long.class)) { + return Long.class; + } + return null; + } +} + +class DoubleClassDetector implements BasicClassDetector { + @Override + 
public Class detectBasicClass(Class type) { + if (type.equals(Double.TYPE) || type.equals(Double.class)) { + return Double.class; + } + return null; + } +} + +class FloatClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Float.TYPE) || type.equals(Float.class)) { + return Float.class; + } + return null; + } +} + +class ShortClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Short.TYPE) || type.equals(Short.class)) { + return Short.class; + } + return null; + } +} + +class CharacterClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Character.TYPE) || type.equals(Character.class)) { + return Character.class; + } + return null; + } +} + +class ByteClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { + return Byte.class; + } + return null; + } +} + +class BooleanClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { + return Boolean.class; + } + return null; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java index f9d76a845..2d4d85b0a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java @@ -24,28 +24,24 @@ public T format(String raw) throws Exception { } protected abstract T formatTrimmed(String raw) throws Exception; - public static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class, LongFormatter.class, 
DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class, CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class); + public static final List basicClassDetector= Arrays.asList(new IntegerClassDetector(), + new LongClassDetector(), + new FloatClassDetector(), + new DoubleClassDetector(), + new ShortClassDetector(), + new ByteClassDetector(), + new BooleanClassDetector(), + new CharacterClassDetector()); public static Class detectBasicClass(Class type) { - if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { - return Integer.class; - } else if (type.equals(Long.TYPE) || type.equals(Long.class)) { - return Long.class; - } else if (type.equals(Double.TYPE) || type.equals(Double.class)) { - return Double.class; - } else if (type.equals(Float.TYPE) || type.equals(Float.class)) { - return Float.class; - } else if (type.equals(Short.TYPE) || type.equals(Short.class)) { - return Short.class; - } else if (type.equals(Character.TYPE) || type.equals(Character.class)) { - return Character.class; - } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { - return Byte.class; - } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { - return Boolean.class; + for (BasicClassDetector detector : basicClassDetector) { + Class detectedClass = detector.detectBasicClass(type); + if (detectedClass != null) { + return detectedClass; + } } return type; } @@ -146,5 +142,4 @@ public Class clazz() { } } - } From f051d978e2f329de8f30455e6ab658789e328f1c Mon Sep 17 00:00:00 2001 From: Parthgajera056 <149322319+Parthgajera056@users.noreply.github.com> Date: Sat, 30 Mar 2024 03:28:02 -0300 Subject: [PATCH 10/37] Refactored code for increased optimization. 
(#1139) * refactoring by decompose conditional technique * refactoring by introduction explaining variable technique * refactoring by rename method/variable technique * refactoring by introducing explaining variable technique * Added Extract class refactoring to increase maintainablilty * Refactoring using replace conditional with polymorphism --- .../main/java/us/codecraft/webmagic/Page.java | 19 ++++-- .../downloader/HttpClientGenerator.java | 3 +- .../webmagic/model/HttpRequestBody.java | 2 +- .../webmagic/selector/ElementsUtil.java | 53 ++++++++++++++++ .../codecraft/webmagic/selector/HtmlNode.java | 63 +++---------------- .../webmagic/configurable/ExtractRule.java | 27 ++++---- .../configurable/SelectorFactory.java | 57 +++++++++++++++++ 7 files changed, 150 insertions(+), 74 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index b4c161a9a..dc87ece87 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -169,18 +169,25 @@ public void addTargetRequests(Iterable requests, long priority) { * @param priority Priority for the URL */ private void addRequestIfValid(String url, long priority) { - if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { - return; + boolean isBlankUrl = StringUtils.isBlank(url); + boolean isHashSymbol = url.equals("#"); + boolean isJavaScript = url.startsWith("javascript:"); + + if (isBlankUrl || isHashSymbol || isJavaScript) { + return; // Invalid URL, so no further processing is needed. 
} String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); - Request req = new Request(canonicalizedUrl); - if(priority > 0) { - req.setPriority(priority); + Request request = new Request(canonicalizedUrl); + + if (priority > 0) { + request.setPriority(priority); } - targetRequests.add(req); + + targetRequests.add(request); } + /** * add url to fetch * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 167a5e1c6..f32a4eba8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -40,13 +40,14 @@ public class HttpClientGenerator { private PoolingHttpClientConnectionManager connectionManager; + private static final int DEFAULT_MAX_PER_ROUTE = 100; public HttpClientGenerator() { Registry reg = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", buildSSLConnectionSocketFactory()) .build(); connectionManager = new PoolingHttpClientConnectionManager(reg); - connectionManager.setDefaultMaxPerRoute(100); + connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE); } private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 7d3b30785..23606d86a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -64,7 +64,7 @@ public void setEncoding(String encoding) { this.encoding = encoding; } - public static HttpRequestBody json(String json, String encoding) { + public static HttpRequestBody createJsonRequestBody(String json, String 
encoding) { try { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } catch (UnsupportedEncodingException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java new file mode 100644 index 000000000..10873c710 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.List; +import java.util.ListIterator; + +public class ElementsUtil { + HtmlNode htmlNode = new HtmlNode(); + public Selectable selectElements(BaseElementSelector elementSelector) { + ListIterator elementIterator = htmlNode.getElements().listIterator(); + if (!elementSelector.hasAttribute()) { + List resultElements = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectElements = elementSelector.selectElements(element); + resultElements.addAll(selectElements); + } + return new HtmlNode(resultElements); + } else { + // has attribute, consider as plaintext + List resultStrings = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectList = elementSelector.selectList(element); + resultStrings.addAll(selectList); + } + return new PlainText(resultStrings); + + } + } + + /** + * Only document can be select + * See: https://github.com/code4craft/webmagic/issues/113 + * + * @param elementIterator elementIterator + * @return element element + */ + public Element checkElementAndConvert(ListIterator elementIterator) { + Element element = elementIterator.next(); + if (!(element instanceof Document)) { + Document root = new Document(element.ownerDocument().baseUri()); + Element clone = element.clone(); + 
root.appendChild(clone); + elementIterator.set(root); + return root; + } + return element; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 85ff5fa69..32a8b976e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -33,19 +33,22 @@ public Selectable smartContent() { @Override public Selectable links() { - return selectElements(new LinksSelector()); + ElementsUtil elementsUtil = new ElementsUtil(); + return elementsUtil.selectElements(new LinksSelector()); } @Override public Selectable xpath(String xpath) { + ElementsUtil elementsUtil = new ElementsUtil(); XpathSelector xpathSelector = Selectors.xpath(xpath); - return selectElements(xpathSelector); + return elementsUtil.selectElements(xpathSelector); } @Override public Selectable selectList(Selector selector) { if (selector instanceof BaseElementSelector) { - return selectElements((BaseElementSelector) selector); + ElementsUtil elementsUtil = new ElementsUtil(); + return elementsUtil.selectElements((BaseElementSelector) selector); } return selectList(selector, getSourceTexts()); } @@ -55,64 +58,18 @@ public Selectable select(Selector selector) { return selectList(selector); } - /** - * select elements - * - * @param elementSelector elementSelector - * @return result - */ - protected Selectable selectElements(BaseElementSelector elementSelector) { - ListIterator elementIterator = getElements().listIterator(); - if (!elementSelector.hasAttribute()) { - List resultElements = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectElements = elementSelector.selectElements(element); - resultElements.addAll(selectElements); - } - return new HtmlNode(resultElements); - } else { - // has attribute, consider as 
plaintext - List resultStrings = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectList = elementSelector.selectList(element); - resultStrings.addAll(selectList); - } - return new PlainText(resultStrings); - - } - } - - /** - * Only document can be select - * See: https://github.com/code4craft/webmagic/issues/113 - * - * @param elementIterator elementIterator - * @return element element - */ - private Element checkElementAndConvert(ListIterator elementIterator) { - Element element = elementIterator.next(); - if (!(element instanceof Document)) { - Document root = new Document(element.ownerDocument().baseUri()); - Element clone = element.clone(); - root.appendChild(clone); - elementIterator.set(root); - return root; - } - return element; - } - @Override public Selectable $(String selector) { + ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector); - return selectElements(cssSelector); + return elementsUtil.selectElements(cssSelector); } @Override public Selectable $(String selector, String attrName) { + ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector, attrName); - return selectElements(cssSelector); + return elementsUtil.selectElements(cssSelector); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java index bbc48ddae..5596cfc7f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -76,26 +76,27 @@ public Selector getSelector() { } private Selector compileSelector() { + SelectorFactory factory; switch (expressionType) { case Css: - if (expressionParams.length >= 1) { - return $(expressionValue, expressionParams[0]); - } else 
{ - return $(expressionValue); - } + factory = new CssSelectorFactory(); + break; case XPath: - return xpath(expressionValue); + factory = new XPathSelectorFactory(); + break; case Regex: - if (expressionParams.length >= 1) { - return regex(expressionValue, Integer.parseInt(expressionParams[0])); - } else { - return regex(expressionValue); - } + factory = new RegexSelectorFactory(); + break; case JsonPath: - return new JsonPathSelector(expressionValue); + factory = new JsonPathSelectorFactory(); + break; default: - return xpath(expressionValue); + factory = new XPathSelectorFactory(); // Default to XPath } + + SelectorCompiler selectorCompiler = new SelectorCompiler(factory); + Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams); + return compiledSelector; } public void setSelector(Selector selector) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java new file mode 100644 index 000000000..7bca4ba7a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java @@ -0,0 +1,57 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.selector.JsonPathSelector; +import us.codecraft.webmagic.selector.Selector; + +import static us.codecraft.webmagic.selector.Selectors.*; +public interface SelectorFactory { + Selector compileSelector(String expressionValue, String[] expressionParams); +} + +class CssSelectorFactory implements SelectorFactory { + @Override + public Selector compileSelector(String expressionValue, String[] expressionParams) { + if (expressionParams.length >= 1) { + return $(expressionValue, expressionParams[0]); + } else { + return $(expressionValue); + } + } +} + +class XPathSelectorFactory implements SelectorFactory { + @Override + public Selector compileSelector(String expressionValue, String[] 
expressionParams) { + return xpath(expressionValue); + } +} + +class RegexSelectorFactory implements SelectorFactory { + @Override + public Selector compileSelector(String expressionValue, String[] expressionParams) { + if (expressionParams.length >= 1) { + return regex(expressionValue, Integer.parseInt(expressionParams[0])); + } else { + return regex(expressionValue); + } + } +} + +class JsonPathSelectorFactory implements SelectorFactory { + @Override + public Selector compileSelector(String expressionValue, String[] expressionParams) { + return new JsonPathSelector(expressionValue); + } +} + +class SelectorCompiler { + private final SelectorFactory selectorFactory; + + public SelectorCompiler(SelectorFactory selectorFactory) { + this.selectorFactory = selectorFactory; + } + + public Selector compileSelector(String expressionValue, String[] expressionParams) { + return selectorFactory.compileSelector(expressionValue, expressionParams); + } +} \ No newline at end of file From 31548deb93b91b9550a3bfe31aad85d2747a78b8 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 30 Mar 2024 14:37:55 +0800 Subject: [PATCH 11/37] Revert "Refactored code for increased optimization. (#1139)" (#1153) This reverts commit f051d978e2f329de8f30455e6ab658789e328f1c. 
--- .../main/java/us/codecraft/webmagic/Page.java | 19 ++---- .../downloader/HttpClientGenerator.java | 3 +- .../webmagic/model/HttpRequestBody.java | 2 +- .../webmagic/selector/ElementsUtil.java | 53 ---------------- .../codecraft/webmagic/selector/HtmlNode.java | 63 ++++++++++++++++--- .../webmagic/configurable/ExtractRule.java | 27 ++++---- .../configurable/SelectorFactory.java | 57 ----------------- 7 files changed, 74 insertions(+), 150 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index dc87ece87..b4c161a9a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -169,25 +169,18 @@ public void addTargetRequests(Iterable requests, long priority) { * @param priority Priority for the URL */ private void addRequestIfValid(String url, long priority) { - boolean isBlankUrl = StringUtils.isBlank(url); - boolean isHashSymbol = url.equals("#"); - boolean isJavaScript = url.startsWith("javascript:"); - - if (isBlankUrl || isHashSymbol || isJavaScript) { - return; // Invalid URL, so no further processing is needed. 
+ if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { + return; } String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); - Request request = new Request(canonicalizedUrl); - - if (priority > 0) { - request.setPriority(priority); + Request req = new Request(canonicalizedUrl); + if(priority > 0) { + req.setPriority(priority); } - - targetRequests.add(request); + targetRequests.add(req); } - /** * add url to fetch * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index f32a4eba8..167a5e1c6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -40,14 +40,13 @@ public class HttpClientGenerator { private PoolingHttpClientConnectionManager connectionManager; - private static final int DEFAULT_MAX_PER_ROUTE = 100; public HttpClientGenerator() { Registry reg = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", buildSSLConnectionSocketFactory()) .build(); connectionManager = new PoolingHttpClientConnectionManager(reg); - connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE); + connectionManager.setDefaultMaxPerRoute(100); } private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 23606d86a..7d3b30785 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -64,7 +64,7 @@ public void setEncoding(String encoding) { this.encoding = encoding; } - public static HttpRequestBody 
createJsonRequestBody(String json, String encoding) { + public static HttpRequestBody json(String json, String encoding) { try { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } catch (UnsupportedEncodingException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java deleted file mode 100644 index 10873c710..000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java +++ /dev/null @@ -1,53 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -import java.util.ArrayList; -import java.util.List; -import java.util.ListIterator; - -public class ElementsUtil { - HtmlNode htmlNode = new HtmlNode(); - public Selectable selectElements(BaseElementSelector elementSelector) { - ListIterator elementIterator = htmlNode.getElements().listIterator(); - if (!elementSelector.hasAttribute()) { - List resultElements = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectElements = elementSelector.selectElements(element); - resultElements.addAll(selectElements); - } - return new HtmlNode(resultElements); - } else { - // has attribute, consider as plaintext - List resultStrings = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectList = elementSelector.selectList(element); - resultStrings.addAll(selectList); - } - return new PlainText(resultStrings); - - } - } - - /** - * Only document can be select - * See: https://github.com/code4craft/webmagic/issues/113 - * - * @param elementIterator elementIterator - * @return element element - */ - public Element checkElementAndConvert(ListIterator elementIterator) { - Element element = elementIterator.next(); - if (!(element instanceof 
Document)) { - Document root = new Document(element.ownerDocument().baseUri()); - Element clone = element.clone(); - root.appendChild(clone); - elementIterator.set(root); - return root; - } - return element; - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 32a8b976e..85ff5fa69 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -33,22 +33,19 @@ public Selectable smartContent() { @Override public Selectable links() { - ElementsUtil elementsUtil = new ElementsUtil(); - return elementsUtil.selectElements(new LinksSelector()); + return selectElements(new LinksSelector()); } @Override public Selectable xpath(String xpath) { - ElementsUtil elementsUtil = new ElementsUtil(); XpathSelector xpathSelector = Selectors.xpath(xpath); - return elementsUtil.selectElements(xpathSelector); + return selectElements(xpathSelector); } @Override public Selectable selectList(Selector selector) { if (selector instanceof BaseElementSelector) { - ElementsUtil elementsUtil = new ElementsUtil(); - return elementsUtil.selectElements((BaseElementSelector) selector); + return selectElements((BaseElementSelector) selector); } return selectList(selector, getSourceTexts()); } @@ -58,18 +55,64 @@ public Selectable select(Selector selector) { return selectList(selector); } + /** + * select elements + * + * @param elementSelector elementSelector + * @return result + */ + protected Selectable selectElements(BaseElementSelector elementSelector) { + ListIterator elementIterator = getElements().listIterator(); + if (!elementSelector.hasAttribute()) { + List resultElements = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectElements = elementSelector.selectElements(element); + 
resultElements.addAll(selectElements); + } + return new HtmlNode(resultElements); + } else { + // has attribute, consider as plaintext + List resultStrings = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectList = elementSelector.selectList(element); + resultStrings.addAll(selectList); + } + return new PlainText(resultStrings); + + } + } + + /** + * Only document can be select + * See: https://github.com/code4craft/webmagic/issues/113 + * + * @param elementIterator elementIterator + * @return element element + */ + private Element checkElementAndConvert(ListIterator elementIterator) { + Element element = elementIterator.next(); + if (!(element instanceof Document)) { + Document root = new Document(element.ownerDocument().baseUri()); + Element clone = element.clone(); + root.appendChild(clone); + elementIterator.set(root); + return root; + } + return element; + } + @Override public Selectable $(String selector) { - ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector); - return elementsUtil.selectElements(cssSelector); + return selectElements(cssSelector); } @Override public Selectable $(String selector, String attrName) { - ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector, attrName); - return elementsUtil.selectElements(cssSelector); + return selectElements(cssSelector); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java index 5596cfc7f..bbc48ddae 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -76,27 +76,26 @@ public Selector getSelector() { } private Selector compileSelector() { - SelectorFactory factory; switch 
(expressionType) { case Css: - factory = new CssSelectorFactory(); - break; + if (expressionParams.length >= 1) { + return $(expressionValue, expressionParams[0]); + } else { + return $(expressionValue); + } case XPath: - factory = new XPathSelectorFactory(); - break; + return xpath(expressionValue); case Regex: - factory = new RegexSelectorFactory(); - break; + if (expressionParams.length >= 1) { + return regex(expressionValue, Integer.parseInt(expressionParams[0])); + } else { + return regex(expressionValue); + } case JsonPath: - factory = new JsonPathSelectorFactory(); - break; + return new JsonPathSelector(expressionValue); default: - factory = new XPathSelectorFactory(); // Default to XPath + return xpath(expressionValue); } - - SelectorCompiler selectorCompiler = new SelectorCompiler(factory); - Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams); - return compiledSelector; } public void setSelector(Selector selector) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java deleted file mode 100644 index 7bca4ba7a..000000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java +++ /dev/null @@ -1,57 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import us.codecraft.webmagic.selector.JsonPathSelector; -import us.codecraft.webmagic.selector.Selector; - -import static us.codecraft.webmagic.selector.Selectors.*; -public interface SelectorFactory { - Selector compileSelector(String expressionValue, String[] expressionParams); -} - -class CssSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - if (expressionParams.length >= 1) { - return $(expressionValue, expressionParams[0]); - } else { - return $(expressionValue); - } - } -} - -class 
XPathSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - return xpath(expressionValue); - } -} - -class RegexSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - if (expressionParams.length >= 1) { - return regex(expressionValue, Integer.parseInt(expressionParams[0])); - } else { - return regex(expressionValue); - } - } -} - -class JsonPathSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - return new JsonPathSelector(expressionValue); - } -} - -class SelectorCompiler { - private final SelectorFactory selectorFactory; - - public SelectorCompiler(SelectorFactory selectorFactory) { - this.selectorFactory = selectorFactory; - } - - public Selector compileSelector(String expressionValue, String[] expressionParams) { - return selectorFactory.compileSelector(expressionValue, expressionParams); - } -} \ No newline at end of file From 0ceaf14882b87fe8606386e16df3ba701e2ad547 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 3 Apr 2024 01:00:47 +0800 Subject: [PATCH 12/37] Bump version number from 0.10.1-SNAPSHOT to 1.0.0-SNAPSHOT for Java version updating from 1.8 to 11, refs #1134. 
--- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index c90394a30..46993a962 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index a6eff4063..98e513c01 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index f9a2f50c8..e1e650276 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index e68385967..2ffedf291 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index ad7fae4ce..8b50671aa 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index bff1de3f6..7530e0a91 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index c81c5613b..02d440017 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml 
b/webmagic-selenium/pom.xml index 8381c0275..92a11795a 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 From 2c730eb978191befca63f5a805e88317f13e4470 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 3 Apr 2024 01:14:36 +0800 Subject: [PATCH 13/37] Update Java version from 1.8 to 11, refs #1134. --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 46993a962..fa7ad3f78 100644 --- a/pom.xml +++ b/pom.xml @@ -7,8 +7,8 @@ UTF-8 UTF-8 - 1.8 - 1.8 + 11 + 11 3.23.1 1.5.0 4.4 From 4ebf48f6e3bf0a7057650d0f6c7045a699e9be25 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 3 Apr 2024 18:26:01 +0800 Subject: [PATCH 14/37] Replace log4j 1.x with log4j 2.x, refs #534. --- pom.xml | 42 +++++++++++-------- webmagic-core/pom.xml | 6 --- webmagic-core/src/main/resources/log4j.xml | 21 ---------- webmagic-core/src/test/resources/log4j.xml | 21 ---------- .../src/test/resources/log4j2-test.xml | 16 +++++++ .../src/main/resources/log4j.xml | 21 ---------- .../src/test/resources/log4j.xml | 21 ---------- .../src/test/resources/log4j2-test.xml | 16 +++++++ webmagic-samples/src/main/resources/log4j.xml | 26 ------------ .../src/main/resources/log4j2.xml | 19 +++++++++ webmagic-scripts/pom.xml | 12 ++++-- .../webmagic/scripts/ScriptConsole.java | 8 ++-- webmagic-scripts/src/main/resources/log4j.xml | 21 ---------- webmagic-scripts/src/test/resouces/log4j.xml | 21 ---------- .../src/test/resources/log4j2-test.xml | 16 +++++++ 15 files changed, 105 insertions(+), 182 deletions(-) delete mode 100644 webmagic-core/src/main/resources/log4j.xml delete mode 100644 webmagic-core/src/test/resources/log4j.xml create mode 100644 webmagic-core/src/test/resources/log4j2-test.xml delete mode 100644 webmagic-extension/src/main/resources/log4j.xml delete mode 100644 webmagic-extension/src/test/resources/log4j.xml create mode 100644 
webmagic-extension/src/test/resources/log4j2-test.xml delete mode 100644 webmagic-samples/src/main/resources/log4j.xml create mode 100644 webmagic-samples/src/main/resources/log4j2.xml delete mode 100755 webmagic-scripts/src/main/resources/log4j.xml delete mode 100755 webmagic-scripts/src/test/resouces/log4j.xml create mode 100644 webmagic-scripts/src/test/resources/log4j2-test.xml diff --git a/pom.xml b/pom.xml index fa7ad3f78..36d060577 100644 --- a/pom.xml +++ b/pom.xml @@ -25,11 +25,11 @@ 2.8.0 4.13.2 2.7.3 - 1.2.17 + 2.23.1 2.0.2-beta 1.3.0 1.2.0 - 11.4 + 12.4 4.14.1 2.0.4 4.0.0.RELEASE @@ -77,6 +77,19 @@ webmagic-coverage + + + org.apache.logging.log4j + log4j-core + test + + + org.apache.logging.log4j + log4j-slf4j2-impl + test + + + @@ -101,6 +114,16 @@ httpcore ${httpcore.version} + + org.apache.logging.log4j + log4j-core + ${log4j2.version} + + + org.apache.logging.log4j + log4j-slf4j2-impl + ${log4j2.version} + com.google.guava guava @@ -116,11 +139,6 @@ slf4j-api ${slf4j.version} - - org.slf4j - slf4j-log4j12 - ${slf4j.version} - us.codecraft xsoup @@ -143,11 +161,6 @@ - - log4j - log4j - ${log4j.version} - org.assertj assertj-core @@ -274,11 +287,6 @@ org.apache.maven.plugins maven-jar-plugin - - - log4j.xml - - org.apache.maven.plugins diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 98e513c01..37f1d0071 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -45,12 +45,6 @@ mockito-all - - org.slf4j - slf4j-log4j12 - true - - org.apache.commons commons-collections4 diff --git a/webmagic-core/src/main/resources/log4j.xml b/webmagic-core/src/main/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-core/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-core/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- 
a/webmagic-core/src/test/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-core/src/test/resources/log4j2-test.xml b/webmagic-core/src/test/resources/log4j2-test.xml new file mode 100644 index 000000000..86aee5f59 --- /dev/null +++ b/webmagic-core/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-extension/src/main/resources/log4j.xml b/webmagic-extension/src/main/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-extension/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-extension/src/test/resources/log4j.xml b/webmagic-extension/src/test/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-extension/src/test/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-extension/src/test/resources/log4j2-test.xml b/webmagic-extension/src/test/resources/log4j2-test.xml new file mode 100644 index 000000000..86aee5f59 --- /dev/null +++ b/webmagic-extension/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-samples/src/main/resources/log4j.xml b/webmagic-samples/src/main/resources/log4j.xml deleted file mode 100644 index a6630f813..000000000 --- a/webmagic-samples/src/main/resources/log4j.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-samples/src/main/resources/log4j2.xml b/webmagic-samples/src/main/resources/log4j2.xml new file mode 100644 index 000000000..f3bad53d8 --- /dev/null +++ b/webmagic-samples/src/main/resources/log4j2.xml @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 02d440017..243eb829f 100644 --- a/webmagic-scripts/pom.xml +++ 
b/webmagic-scripts/pom.xml @@ -13,6 +13,14 @@ + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-slf4j2-impl + org.jruby jruby @@ -40,10 +48,6 @@ webmagic-core ${project.version} - - org.slf4j - slf4j-log4j12 - ${project.groupId} webmagic-extension diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java index 0423e58e1..2ccfe7f4e 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java @@ -1,8 +1,10 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.cli.*; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.core.Logger; +import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; @@ -166,7 +168,7 @@ private static Params readOptions(CommandLine commandLine) { } private static void configLogger(String value) { - Logger rootLogger = Logger.getRootLogger(); + Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); if ("debug".equalsIgnoreCase(value)) { rootLogger.setLevel(Level.DEBUG); } else if ("info".equalsIgnoreCase(value)) { diff --git a/webmagic-scripts/src/main/resources/log4j.xml b/webmagic-scripts/src/main/resources/log4j.xml deleted file mode 100755 index 474269cb1..000000000 --- a/webmagic-scripts/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml deleted file mode 100755 index 1f64d8dad..000000000 --- a/webmagic-scripts/src/test/resouces/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - 
- diff --git a/webmagic-scripts/src/test/resources/log4j2-test.xml b/webmagic-scripts/src/test/resources/log4j2-test.xml new file mode 100644 index 000000000..e2fab6602 --- /dev/null +++ b/webmagic-scripts/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + From ed7429c29322a2755299e801a978f259bf69495c Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 3 Apr 2024 19:37:56 +0800 Subject: [PATCH 15/37] Rename webmagic-parent to webmagic. --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 4 ++-- webmagic-selenium/pom.xml | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pom.xml b/pom.xml index 36d060577..3b8169798 100644 --- a/pom.xml +++ b/pom.xml @@ -35,8 +35,8 @@ 4.0.0.RELEASE 0.3.5 - webmagic-parent - webmagic-parent + webmagic + webmagic A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simply the development of a specific crawler. 
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 37f1d0071..877124fc3 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -2,7 +2,7 @@ us.codecraft - webmagic-parent + webmagic 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e1e650276..c17309c87 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -7,7 +7,7 @@ us.codecraft - webmagic-parent + webmagic 1.0.0-SNAPSHOT diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 2ffedf291..a234a4f7a 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -2,7 +2,7 @@ us.codecraft - webmagic-parent + webmagic 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 8b50671aa..9de8bcb4c 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -1,8 +1,8 @@ - webmagic-parent us.codecraft + webmagic 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 7530e0a91..28b921093 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -1,8 +1,8 @@ - webmagic-parent us.codecraft + webmagic 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 243eb829f..7a294e18c 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -1,8 +1,8 @@ - webmagic-parent us.codecraft + webmagic 1.0.0-SNAPSHOT 4.0.0 @@ -94,4 +94,4 @@ - \ No newline at end of file + diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 92a11795a..87de28eee 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -1,8 +1,8 @@ - webmagic-parent us.codecraft + webmagic 1.0.0-SNAPSHOT 4.0.0 From 383bea32f6ba5338c65a244d49293d3a34038318 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 3 Apr 2024 19:44:15 +0800 Subject: [PATCH 16/37] Bump com.jayway.jsonpath:json-path from 2.8.0 to 2.9.0 (#1154) Bumps 
[com.jayway.jsonpath:json-path](https://github.com/jayway/JsonPath) from 2.8.0 to 2.9.0. - [Release notes](https://github.com/jayway/JsonPath/releases) - [Changelog](https://github.com/json-path/JsonPath/blob/master/changelog.md) - [Commits](https://github.com/jayway/JsonPath/compare/json-path-2.8.0...json-path-2.9.0) --- updated-dependencies: - dependency-name: com.jayway.jsonpath:json-path dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 3b8169798..96bf09ae2 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ 4.4.15 3.7.1 9.3.9.0 - 2.8.0 + 2.9.0 4.13.2 2.7.3 2.23.1 From f10fabcb5830c305cb53bd886fde5393a23a224c Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 3 Apr 2024 20:21:02 +0800 Subject: [PATCH 17/37] Update .gitignore, with merging Maven.gitignore & Global/Eclipse.gitignore in github/gitignore. 
--- .gitignore | 82 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 0175dbaad..3a839a5f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,77 @@ -target -*.iml -out/ -.idea -.classpath +target/ +pom.xml.tag +pom.xml.releaseBackup +pom.xml.versionsBackup +pom.xml.next +release.properties +dependency-reduced-pom.xml +buildNumber.properties +.mvn/timing.properties +# https://github.com/takari/maven-wrapper#usage-without-binary-jar +.mvn/wrapper/maven-wrapper.jar + +# Eclipse m2e generated files +# Eclipse Core .project -.settings/ +# JDT-specific (Eclipse Java Development Tools) +.classpath +.metadata bin/ -.myeclipse +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ +.apt_generated_test/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +# Uncomment this line if you wish to ignore the project description file. 
+# Typically, this file would be tracked if it contains build/dependency configurations: +#.project From 05e5eefc7d9e7dd8fd8b85cb297b2f5e30f56e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Gibier?= Date: Fri, 5 Apr 2024 15:51:08 +0200 Subject: [PATCH 18/37] Refactor of processSingle in PageModelExtractor (#1155) --- webmagic-extension/pom.xml | 6 + .../codecraft/webmagic/model/Extractor.java | 24 +-- .../webmagic/model/FieldExtractor.java | 40 +---- .../webmagic/model/PageModelExtractor.java | 154 ++---------------- .../webmagic/model/fields/MultipleField.java | 42 +++++ .../webmagic/model/fields/PageField.java | 31 ++++ .../webmagic/model/fields/SingleField.java | 28 ++++ .../model/selections/MultipleSelection.java | 36 ++++ .../webmagic/model/selections/Selection.java | 9 + .../model/selections/SingleSelection.java | 33 ++++ .../webmagic/utils/DoubleKeyMap.java | 0 .../webmagic/utils/MultiKeyMapBase.java | 0 12 files changed, 217 insertions(+), 186 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java mode change 100755 => 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java mode change 100755 => 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index a234a4f7a..8d2c07003 100644 --- a/webmagic-extension/pom.xml +++ 
b/webmagic-extension/pom.xml @@ -10,6 +10,12 @@ webmagic-extension + + org.projectlombok + lombok + 1.18.32 + provided + redis.clients jedis diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index f1d2f84d4..d64adffd7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.model; +import lombok.Getter; +import lombok.Setter; import us.codecraft.webmagic.selector.Selector; /** @@ -7,17 +9,19 @@ * @author code4crafter@gmail.com
* @since 0.2.0 */ -class Extractor { +public class Extractor { + @Getter @Setter protected Selector selector; + @Getter protected final Source source; protected final boolean notNull; protected final boolean multi; - static enum Source {Html, Url, RawHtml, RawText} + public static enum Source {Html, Url, RawHtml, RawText} public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; @@ -26,23 +30,11 @@ public Extractor(Selector selector, Source source, boolean notNull, boolean mult this.multi = multi; } - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - boolean isNotNull() { + public boolean isNotNull() { return notNull; } - boolean isMulti() { + public boolean isMulti() { return multi; } - - void setSelector(Selector selector) { - this.selector = selector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index a2cba1332..a49ea7766 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -6,53 +6,27 @@ import java.lang.reflect.Field; import java.lang.reflect.Method; +import lombok.Getter; +import lombok.Setter; + /** * Wrapper of field and extractor. * @author code4crafter@gmail.com
* @since 0.2.0 */ -class FieldExtractor extends Extractor { +public class FieldExtractor extends Extractor { + @Getter private final Field field; + @Getter @Setter private Method setterMethod; + @Getter @Setter private ObjectFormatter objectFormatter; public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) { super(selector, source, notNull, multi); this.field = field; } - - Field getField() { - return field; - } - - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - void setSetterMethod(Method setterMethod) { - this.setterMethod = setterMethod; - } - - Method getSetterMethod() { - return setterMethod; - } - - boolean isNotNull() { - return notNull; - } - - ObjectFormatter getObjectFormatter() { - return objectFormatter; - } - - void setObjectFormatter(ObjectFormatter objectFormatter) { - this.objectFormatter = objectFormatter; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index d8947ded6..de71717fd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -3,17 +3,21 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import lombok.Getter; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; -import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; +import us.codecraft.webmagic.model.selections.MultipleSelection; +import us.codecraft.webmagic.model.selections.Selection; +import us.codecraft.webmagic.model.selections.SingleSelection; import us.codecraft.webmagic.selector.*; import 
us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; import java.lang.annotation.Annotation; import java.lang.reflect.Field; -import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; @@ -29,14 +33,19 @@ */ class PageModelExtractor { + @Getter private List targetUrlPatterns = new ArrayList(); + @Getter private Selector targetUrlRegionSelector; + @Getter private List helpUrlPatterns = new ArrayList(); + @Getter private Selector helpUrlRegionSelector; + @Getter private Class clazz; private List fieldExtractors; @@ -233,145 +242,16 @@ private Object processSingle(Page page, String html, boolean isRaw) { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - if (fieldExtractor.isMulti()) { - List value=getMultiValueFromSource(page, fieldExtractor, html, isRaw); - if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - List converted = convertMultiValue(value, fieldExtractor.getObjectFormatter()); - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } else { - String value=getSingleValueFromSource(page, fieldExtractor, html, isRaw); - if (value == null && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - Object converted = convertSingleValue(value, fieldExtractor.getObjectFormatter()); - if (converted == null && fieldExtractor.isNotNull()) { - return null; - } - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } + Selection selection = fieldExtractor.isMulti() ? 
new MultipleSelection() : new SingleSelection(); + PageField field = selection.extractField(page, html, isRaw, fieldExtractor); + if (!field.operation(o, fieldExtractor, logger)) + return null; } - if (AfterExtractor.class.isAssignableFrom(clazz)) { + if (AfterExtractor.class.isAssignableFrom(clazz)) ((AfterExtractor) o).afterProcess(page); - } - } catch (InstantiationException e) { - logger.error("extract fail", e); - } catch (IllegalAccessException e) { - logger.error("extract fail", e); - } catch (InvocationTargetException e) { + } catch (Exception e) { logger.error("extract fail", e); } return o; } - - private List getMultiValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { - List value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().selectList(html); - } - break; - case Url: - value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().selectList(html); - } - return value; - } - - private String getSingleValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { - String value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().select(html); - } - break; - case Url: - value = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().select(page.getRawText()); - break; - default: - value 
= fieldExtractor.getSelector().select(html); - } - return value; - } - - private Object convertSingleValue(String value, ObjectFormatter objectFormatter) { - try { - Object format = objectFormatter.format(value); - logger.debug("String {} is converted to {}", value, format); - return format; - } catch (Exception e) { - logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); - } - return null; - } - - private List convertMultiValue(List values, ObjectFormatter objectFormatter) { - List objects = new ArrayList(); - for (String value : values) { - Object converted = convertSingleValue(value, objectFormatter); - if (converted != null) { - objects.add(converted); - } - } - return objects; - } - - private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { - if (value == null) { - return; - } - if (fieldExtractor.getSetterMethod() != null) { - fieldExtractor.getSetterMethod().invoke(o, value); - } - fieldExtractor.getField().set(o, value); - } - - Class getClazz() { - return clazz; - } - - List getTargetUrlPatterns() { - return targetUrlPatterns; - } - - List getHelpUrlPatterns() { - return helpUrlPatterns; - } - - Selector getTargetUrlRegionSelector() { - return targetUrlRegionSelector; - } - - Selector getHelpUrlRegionSelector() { - return helpUrlRegionSelector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java new file mode 100644 index 000000000..4a4bf38a8 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; +import java.util.List; + +import org.slf4j.Logger; + +import lombok.Getter; +import 
us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public class MultipleField extends PageField { + @Getter + private List fieldNames; + + public MultipleField(List fieldNames) { + this.fieldNames = fieldNames; + } + + public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException { + if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull()) + return false; + if (fieldExtractor.getObjectFormatter() != null) { + List converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger); + setField(o, fieldExtractor, converted); + } + else + setField(o, fieldExtractor, this.fieldNames); + return true; + } + + private List convert(List values, ObjectFormatter objectFormatter, Logger logger) { + List objects = new ArrayList<>(); + for (String value : values) { + Object converted = this.convert(value, objectFormatter, logger); + if (converted != null) + objects.add(converted); + } + return objects; + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java new file mode 100644 index 000000000..ad4428335 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; + +import org.slf4j.Logger; + +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public abstract class PageField { + public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException; + + protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) 
{ + try { + Object format = objectFormatter.format(value); + logger.debug("String {} is converted to {}", value, format); + return format; + } catch (Exception e) { + logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); + } + return null; + } + + protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { + if (value != null) { + if (fieldExtractor.getSetterMethod() != null) + fieldExtractor.getSetterMethod().invoke(o, value); + fieldExtractor.getField().set(o, value); + } + } +}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java
new file mode 100644
index 000000000..136a1c56e
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java
@@ -0,0 +1,45 @@
+package us.codecraft.webmagic.model.fields;
+
+import java.lang.reflect.InvocationTargetException;
+
+import org.slf4j.Logger;
+
+import lombok.Getter;
+import us.codecraft.webmagic.model.FieldExtractor;
+
+/**
+ * A {@link PageField} holding the single string selected for one model field.
+ */
+public class SingleField extends PageField {
+    /** The selected text; may be {@code null} when nothing was selected. */
+    @Getter
+    private String fieldName;
+
+    public SingleField(String fieldName) {
+        this.fieldName = fieldName;
+    }
+
+    /**
+     * Converts the selected text with the extractor's formatter, if any, and stores it on {@code o}.
+     *
+     * @param o              the model instance being populated
+     * @param fieldExtractor supplies the target field, the optional formatter and the not-null rule
+     * @param logger         receives conversion diagnostics
+     * @return {@code false} when the field is declared not-null but no usable value was
+     *         obtained; {@code true} otherwise
+     */
+    public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
+        // Enforce not-null before converting, mirroring MultipleField#operation; otherwise a
+        // missing value on a field without a formatter would be accepted silently.
+        if (this.fieldName == null && fieldExtractor.isNotNull())
+            return false;
+        if (fieldExtractor.getObjectFormatter() != null) {
+            Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
+            if (converted == null && fieldExtractor.isNotNull())
+                return false;
+            setField(o, fieldExtractor, converted);
+        } else
+            setField(o, fieldExtractor, this.fieldName);
+        return true;
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java new file mode 100644 index 000000000..d49f9c576 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.model.selections; + +import java.util.List; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.MultipleField; + +public class MultipleSelection implements Selection { + public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + List fieldsName; + switch (fieldExtractor.getSource()) { + case RawHtml: + fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + break; + case Html: + if (isRaw) + fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + else + fieldsName = fieldExtractor.getSelector().selectList(html); + break; + case Url: + fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString()); + break; + case RawText: + fieldsName = fieldExtractor.getSelector().selectList(page.getRawText()); + break; + default: + fieldsName = fieldExtractor.getSelector().selectList(html); + } + if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) { + return null; + } + return new MultipleField(fieldsName); + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java new file mode 100644 index 000000000..e70ab9d9b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java @@ -0,0 +1,9 @@ +package us.codecraft.webmagic.model.selections; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import 
us.codecraft.webmagic.model.fields.PageField; + +public interface Selection { + public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java new file mode 100644 index 000000000..a4c1fe452 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.model.selections; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.SingleField; + +public class SingleSelection implements Selection { + public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + String field; + switch (fieldExtractor.getSource()) { + case RawHtml: + field = page.getHtml().selectDocument(fieldExtractor.getSelector()); + break; + case Html: + if (isRaw) + field = page.getHtml().selectDocument(fieldExtractor.getSelector()); + else + field = fieldExtractor.getSelector().select(html); + break; + case Url: + field = fieldExtractor.getSelector().select(page.getUrl().toString()); + break; + case RawText: + field = fieldExtractor.getSelector().select(page.getRawText()); + break; + default: + field = fieldExtractor.getSelector().select(html); + } + if (field == null && fieldExtractor.isNotNull()) + return null; + return new SingleField(field); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java old mode 100755 new mode 100644 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java old mode 100755 new 
mode 100644 From 2df7dca8711d226dd98bd0afefa4531a6d1e44b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Gibier?= Date: Fri, 5 Apr 2024 16:50:21 +0200 Subject: [PATCH 19/37] Changed refactor of processSingle again, this one is a better version (#1157) * Refactor of processSingle in PageModelExtractor * Changed my refactor of processSingle, this one is a lot better * Changed my refactor of processSingle, this one is a lot better --- .../codecraft/webmagic/model/Extractor.java | 6 +- .../webmagic/model/FieldExtractor.java | 1 + .../webmagic/model/PageModelExtractor.java | 36 +++++----- .../model/selections/MultipleSelection.java | 36 ---------- .../webmagic/model/selections/Selection.java | 9 --- .../model/selections/SingleSelection.java | 33 --------- .../webmagic/model/sources/Source.java | 68 +++++++++++++++++++ .../model/sources/SourceTextExtractor.java | 17 +++++ 8 files changed, 105 insertions(+), 101 deletions(-) delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index d64adffd7..673447586 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -2,6 +2,8 @@ import lombok.Getter; import lombok.Setter; + +import us.codecraft.webmagic.model.sources.Source; import 
us.codecraft.webmagic.selector.Selector; /** @@ -20,9 +22,7 @@ public class Extractor { protected final boolean notNull; protected final boolean multi; - - public static enum Source {Html, Url, RawHtml, RawText} - + public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; this.source = source; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index a49ea7766..d4cb5937f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.sources.Source; import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index de71717fd..751aafe76 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -9,9 +9,9 @@ import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; -import us.codecraft.webmagic.model.selections.MultipleSelection; -import us.codecraft.webmagic.model.selections.Selection; -import us.codecraft.webmagic.model.selections.SingleSelection; +import us.codecraft.webmagic.model.sources.Source; +import us.codecraft.webmagic.model.sources.SourceTextExtractor; +import us.codecraft.webmagic.model.sources.Source.*; import us.codecraft.webmagic.selector.*; import 
us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; @@ -95,7 +95,7 @@ private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { regexPattern = ".*"; } fieldExtractor = new FieldExtractor(field, - new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), + new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(), extractByUrl.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -121,7 +121,7 @@ private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) { default: selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); } - fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, + fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? 
new RawHtml() : new SelectedHtml(), comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -136,26 +136,23 @@ private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); - ExtractBy.Source source0 = extractBy.source(); - if (extractBy.type()== ExtractBy.Type.JsonPath){ - source0 = RawText; - } - FieldExtractor.Source source = null; - switch (source0){ + ExtractBy.Source extractSource = extractBy.source(); + if (extractBy.type()== ExtractBy.Type.JsonPath) + extractSource = RawText; + Source source = null; + switch (extractSource) { case RawText: - source = FieldExtractor.Source.RawText; + source = new RawText(); break; case RawHtml: - source = FieldExtractor.Source.RawHtml; + source = new RawHtml(); break; case SelectedHtml: - source =FieldExtractor.Source.Html; + source = new SelectedHtml(); break; default: - source =FieldExtractor.Source.Html; - + source = new SelectedHtml(); } - fieldExtractor = new FieldExtractor(field, selector, source, extractBy.notNull(), List.class.isAssignableFrom(field.getType())); fieldExtractor.setSetterMethod(getSetterMethod(clazz, field)); @@ -202,7 +199,7 @@ private void initClassExtractors() { annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi()); } } @@ -242,8 +239,7 @@ private Object processSingle(Page page, String html, boolean isRaw) { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - 
Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection(); - PageField field = selection.extractField(page, html, isRaw, fieldExtractor); + PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor); if (!field.operation(o, fieldExtractor, logger)) return null; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java deleted file mode 100644 index d49f9c576..000000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java +++ /dev/null @@ -1,36 +0,0 @@ -package us.codecraft.webmagic.model.selections; - -import java.util.List; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.FieldExtractor; -import us.codecraft.webmagic.model.fields.MultipleField; - -public class MultipleSelection implements Selection { - public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { - List fieldsName; - switch (fieldExtractor.getSource()) { - case RawHtml: - fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) - fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - else - fieldsName = fieldExtractor.getSelector().selectList(html); - break; - case Url: - fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - fieldsName = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - fieldsName = fieldExtractor.getSelector().selectList(html); - } - if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) { - return null; - } - return new MultipleField(fieldsName); - } -} \ No newline at end of file diff --git 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java deleted file mode 100644 index e70ab9d9b..000000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java +++ /dev/null @@ -1,9 +0,0 @@ -package us.codecraft.webmagic.model.selections; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.FieldExtractor; -import us.codecraft.webmagic.model.fields.PageField; - -public interface Selection { - public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java deleted file mode 100644 index a4c1fe452..000000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java +++ /dev/null @@ -1,33 +0,0 @@ -package us.codecraft.webmagic.model.selections; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.FieldExtractor; -import us.codecraft.webmagic.model.fields.SingleField; - -public class SingleSelection implements Selection { - public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { - String field; - switch (fieldExtractor.getSource()) { - case RawHtml: - field = page.getHtml().selectDocument(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) - field = page.getHtml().selectDocument(fieldExtractor.getSelector()); - else - field = fieldExtractor.getSelector().select(html); - break; - case Url: - field = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - case RawText: - field = fieldExtractor.getSelector().select(page.getRawText()); - break; - default: - field = fieldExtractor.getSelector().select(html); - } - if 
(field == null && fieldExtractor.isNotNull()) - return null; - return new SingleField(field); - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java new file mode 100644 index 000000000..146827220 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java @@ -0,0 +1,68 @@ +package us.codecraft.webmagic.model.sources; + +import java.util.List; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; + +public interface Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); + + public class RawHtml implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return page.getHtml().selectDocument(fieldExtractor.getSelector()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } + } + + public class SelectedHtml implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + if (isRaw) + return page.getHtml().selectDocument(fieldExtractor.getSelector()); + else + return fieldExtractor.getSelector().select(html); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + if (isRaw) + return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + else + return fieldExtractor.getSelector().selectList(html); + } + } + + public class Url implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(page.getUrl().toString()); + } + + 
public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(page.getUrl().toString()); + } + } + + public class RawText implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(page.getRawText()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(page.getRawText()); + } + } + + public class DefaultSource implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(html); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(html); + } + } +} + diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java new file mode 100644 index 000000000..1e572695f --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.model.sources; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.MultipleField; +import us.codecraft.webmagic.model.fields.PageField; +import us.codecraft.webmagic.model.fields.SingleField; + +public class SourceTextExtractor { + public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + Source source = fieldExtractor.getSource(); + if (fieldExtractor.isMulti()) + return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor)); + else + return new SingleField(source.getText(page, html, isRaw, 
fieldExtractor)); + } +} \ No newline at end of file From d8321baf560e4d5742909c33d8f1dacee590fea0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Gibier?= Date: Sat, 6 Apr 2024 01:55:46 +0200 Subject: [PATCH 20/37] Refactored and implement of a template method pattern for logger config in webmagic-scripts (#1158) * Refactor of processSingle in PageModelExtractor * Changed my refactor of processSingle, this one is a lot better * Changed my refactor of processSingle, this one is a lot better * add lombok for getters and setters * Refactored and implement of a template method pattern for logger config --- webmagic-scripts/pom.xml | 6 + .../us/codecraft/webmagic/scripts/Params.java | 47 +++++++ .../webmagic/scripts/ScriptConsole.java | 117 +----------------- .../webmagic/scripts/ScriptEnginePool.java | 6 +- .../webmagic/scripts/ScriptProcessor.java | 35 +----- .../scripts/ScriptProcessorBuilder.java | 7 +- .../scripts/config/CommandLineOption.java | 82 ++++++++++++ .../webmagic/scripts/config/ConfigLogger.java | 34 +++++ .../webmagic/scripts/languages/JRuby.java | 26 ++++ .../scripts/languages/Javascript.java | 16 +++ .../webmagic/scripts/languages/Jython.java | 27 ++++ .../scripts/{ => languages}/Language.java | 29 +++-- .../webmagic/scripts/ScriptProcessorTest.java | 10 +- 13 files changed, 274 insertions(+), 168 deletions(-) create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/CommandLineOption.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java rename 
webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/{ => languages}/Language.java (51%) mode change 100755 => 100644 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 7a294e18c..aa5a47981 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -53,6 +53,12 @@ webmagic-extension ${project.version} + + org.projectlombok + lombok + 1.18.32 + provided +
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java
new file mode 100644
index 000000000..873176e6e
--- /dev/null
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java
@@ -0,0 +1,56 @@
+package us.codecraft.webmagic.scripts;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import lombok.Getter;
+import lombok.Setter;
+import us.codecraft.webmagic.scripts.languages.JRuby;
+import us.codecraft.webmagic.scripts.languages.Javascript;
+import us.codecraft.webmagic.scripts.languages.Language;
+import us.codecraft.webmagic.utils.WMCollections;
+
+/**
+ * Settings collected from the script console's command line.
+ */
+public class Params {
+    /** Accepted language spellings, keyed by the language they select; filled once at class load. */
+    private static final Map<Language, Set<String>> alias = new HashMap<>();
+
+    static {
+        alias.put(new Javascript(), WMCollections.newHashSet("js", "javascript", "JavaScript", "JS"));
+        alias.put(new JRuby(), WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby"));
+    }
+
+    @Getter
+    Language language = new Javascript();
+
+    @Getter @Setter
+    String scriptFileName;
+
+    @Getter @Setter
+    List<String> urls;
+
+    @Getter @Setter
+    int thread = 1;
+
+    @Getter @Setter
+    int sleepTime = 1000;
+
+    /**
+     * Selects the language whose alias set contains {@code arg}; an unknown value
+     * leaves the current language unchanged.
+     *
+     * @param arg the language name as typed by the user
+     */
+    public void setLanguagefromArg(String arg) {
+        for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
+            if (languageSetEntry.getValue().contains(arg)) {
+                this.language = languageSetEntry.getKey();
+                return;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java
b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java index 2ccfe7f4e..c60b3ec3d 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java @@ -1,90 +1,21 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.cli.*; -import org.apache.logging.log4j.Level; -import org.apache.logging.log4j.core.Logger; -import org.slf4j.LoggerFactory; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.scripts.config.CommandLineOption; import us.codecraft.webmagic.utils.WMCollections; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.Set; /** - * @author code4crafter@gmail.com + * @author code4crafter@gmail.com / FrancoisGib * @since 0.4.1 */ public class ScriptConsole { - - private static class Params { - Language language = Language.JavaScript; - String scriptFileName; - List urls; - int thread = 1; - int sleepTime = 1000; - private static Map> alias = new HashMap>(); - - static { - alias.put(Language.JavaScript, WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); - alias.put(Language.JRuby, WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); - } - - public void setLanguagefromArg(String arg) { - for (Map.Entry> languageSetEntry : alias.entrySet()) { - if (languageSetEntry.getValue().contains(arg)) { - this.language = languageSetEntry.getKey(); - return; - } - } - } - - private Language getLanguage() { - return language; - } - - private void setLanguage(Language language) { - this.language = language; - } - - private String getScriptFileName() { - return scriptFileName; - } - - private void setScriptFileName(String scriptFileName) { - this.scriptFileName = scriptFileName; - } - - private List getUrls() { - return urls; - 
} - - private void setUrls(List urls) { - this.urls = urls; - } - - private int getThread() { - return thread; - } - - private void setThread(int thread) { - this.thread = thread; - } - - private int getSleepTime() { - return sleepTime; - } - - private void setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - } - } - public static void main(String[] args) { Params params = parseCommand(args); startSpider(params); @@ -142,45 +73,9 @@ private static void exit() { private static Params readOptions(CommandLine commandLine) { Params params = new Params(); - if (commandLine.hasOption("l")) { - String language = commandLine.getOptionValue("l"); - params.setLanguagefromArg(language); - } - if (commandLine.hasOption("f")) { - String scriptFilename = commandLine.getOptionValue("f"); - params.setScriptFileName(scriptFilename); - } else { - exit(); - } - if (commandLine.hasOption("s")) { - Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s")); - params.setSleepTime(sleepTime); - } - if (commandLine.hasOption("t")) { - Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); - params.setThread(thread); - } - if (commandLine.hasOption("g")) { - configLogger(commandLine.getOptionValue("g")); - } - params.setUrls(commandLine.getArgList()); + List options = CommandLineOption.getAllOptions(); + for (CommandLineOption option : options) + option.addParamOptionIfInCommandLine(params, commandLine); return params; } - - private static void configLogger(String value) { - Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); - if ("debug".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.DEBUG); - } else if ("info".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.INFO); - } else if ("warn".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.WARN); - } else if ("trace".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.TRACE); - } else if ("off".equalsIgnoreCase(value)) { - 
rootLogger.setLevel(Level.OFF); - } else if ("error".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.ERROR); - } - } -} +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java index d1e5d7fe8..bdfbbaedb 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java @@ -2,6 +2,9 @@ import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; + +import us.codecraft.webmagic.scripts.languages.Language; + import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -11,14 +14,11 @@ */ public class ScriptEnginePool { - private final int size; - private final AtomicInteger availableCount; private final LinkedBlockingQueue scriptEngines = new LinkedBlockingQueue(); public ScriptEnginePool(Language language,int size) { - this.size = size; this.availableCount = new AtomicInteger(size); for (int i=0;i getAllOptions() { + return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG()); + } +} + +class OptionL extends CommandLineOption { + public OptionL() { + super('l'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String language = commandLine.getOptionValue("l"); + params.setLanguagefromArg(language); + } +} + +class OptionF extends CommandLineOption { + public OptionF() { + super('f'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String scriptFilename = commandLine.getOptionValue("f"); + params.setScriptFileName(scriptFilename); + } +} + +class OptionS extends CommandLineOption { + public OptionS() { + super('s'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer sleepTime = 
Integer.parseInt(commandLine.getOptionValue("s")); + params.setSleepTime(sleepTime); + } +} + +class OptionT extends CommandLineOption { + public OptionT() { + super('t'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); + params.setThread(thread); + } +} + +class OptionG extends CommandLineOption { + public OptionG() { + super('g'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + ConfigLogger.configLogger(commandLine.getOptionValue("g")); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java new file mode 100644 index 000000000..9e81ea6c7 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.scripts.config; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.core.Logger; +import org.slf4j.LoggerFactory; + +public class ConfigLogger { + /** + * Log the config parameter. 
Sets the root logger to the level named by
+     * {@code value} (case-insensitive); an unrecognized or null value leaves it unchanged.
+     *
+     * @param value The config string
+     */
+    public static void configLogger(String value) {
+        List<Pair<String, Level>> options = List.of(
+                Pair.of("debug", Level.DEBUG),
+                Pair.of("info", Level.INFO),
+                Pair.of("warn", Level.WARN),
+                Pair.of("trace", Level.TRACE),
+                Pair.of("off", Level.OFF),
+                Pair.of("error", Level.ERROR));
+        // Match each option directly; the old index-based scan never applied the last entry ("error").
+        for (Pair<String, Level> option : options) {
+            if (option.getLeft().equalsIgnoreCase(value)) {
+                Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
+                rootLogger.setLevel(option.getRight());
+                return;
+            }
+        }
+    }
+}
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java new file mode 100644 index 000000000..b3a3209a5 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.jruby.RubyHash; + +import us.codecraft.webmagic.Page; + +public class JRuby extends Language { + public JRuby() { + super("jruby","ruby/defines.rb",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext()); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry) itruby.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java
b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java new file mode 100644 index 000000000..b0f7b647a --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import us.codecraft.webmagic.Page; + +public class Javascript extends Language { + public Javascript() { + super("javascript","js/defines.js",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java new file mode 100644 index 000000000..9124d2dbb --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.python.core.PyDictionary; + +import us.codecraft.webmagic.Page; + +public class Jython extends Language { + public Jython() { + super("jython","python/defines.py",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + PyDictionary oJython = (PyDictionary) engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry) it.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git 
a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java old mode 100755 new mode 100644 similarity index 51% rename from webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java rename to webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java index 2f9d22d57..44e6ba0a0 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java @@ -1,15 +1,18 @@ -package us.codecraft.webmagic.scripts; +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; +import us.codecraft.webmagic.Page; /** - * @author code4crafter@gmail.com + * @author FrancoisGib */ -public enum Language { - - JavaScript("javascript","js/defines.js",""), - - JRuby("jruby","ruby/defines.rb",""), - - Jython("jython","python/defines.py",""); +public abstract class Language { + public Language(String engineName, String defineFile, String gatherFile) { + this.engineName = engineName; + this.defineFile = defineFile; + this.gatherFile = gatherFile; + } private String engineName; @@ -17,12 +20,6 @@ public enum Language { private String gatherFile; - Language(String engineName, String defineFile, String gatherFile) { - this.engineName = engineName; - this.defineFile = defineFile; - this.gatherFile = gatherFile; - } - public String getEngineName() { return engineName; } @@ -34,4 +31,6 @@ public String getDefineFile() { public String getGatherFile() { return gatherFile; } + + public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException; } diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java index 
ffeb9c993..b4c28521f 100755 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -2,7 +2,11 @@ import org.junit.Ignore; import org.junit.Test; + import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.scripts.languages.JRuby; +import us.codecraft.webmagic.scripts.languages.Javascript; +import us.codecraft.webmagic.scripts.languages.Jython; /** * @author code4crafter@gmail.com @@ -13,14 +17,14 @@ public class ScriptProcessorTest { @Test public void testJavaScriptProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @Test public void testRubyProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @@ -28,7 +32,7 @@ public void testRubyProcessor() { @Test public void testPythonProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build(); pageProcessor.getSite().setSleepTime(0); 
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } From b7e0d360ec68ab5c91e28e2d95e3cdb04670a211 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 22:04:48 +0800 Subject: [PATCH 21/37] Upgrade junit from 4.13.2 to 5.10.2. --- pom.xml | 49 ++++++++++++++++++++++++++++++++------ webmagic-core/pom.xml | 5 ---- webmagic-extension/pom.xml | 4 ---- webmagic-samples/pom.xml | 4 ---- webmagic-saxon/pom.xml | 4 ---- webmagic-scripts/pom.xml | 5 ---- webmagic-selenium/pom.xml | 4 ---- 7 files changed, 42 insertions(+), 33 deletions(-) diff --git a/pom.xml b/pom.xml index 96bf09ae2..f08b3b543 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,8 @@ 3.7.1 9.3.9.0 2.9.0 - 4.13.2 + 5.10.2 + 1.10.2 2.7.3 2.23.1 2.0.2-beta @@ -88,16 +89,30 @@ log4j-slf4j2-impl test + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.platform + junit-platform-launcher + test + + + org.junit.platform + junit-platform-runner + test + - - junit - junit - ${junit.version} - test - org.mockito mockito-all @@ -134,6 +149,26 @@ json-path ${json-path.version} + + org.junit.jupiter + junit-jupiter-engine + ${junit.version} + + + org.junit.vintage + junit-vintage-engine + ${junit.version} + + + org.junit.platform + junit-platform-launcher + ${junit.platform.version} + + + org.junit.platform + junit-platform-runner + ${junit.platform.version} + org.slf4j slf4j-api diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 877124fc3..9f2eda76c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -15,11 +15,6 @@ httpclient - - junit - junit - - org.apache.commons commons-lang3 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 8d2c07003..b72922317 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -35,10 +35,6 @@ webmagic-core ${project.version} - - junit - junit - diff --git a/webmagic-samples/pom.xml 
b/webmagic-samples/pom.xml index 9de8bcb4c..41a4b7b45 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -20,10 +20,6 @@ webmagic-extension ${project.version} - - junit - junit - org.mapdb mapdb diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 28b921093..930f5b32c 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -23,10 +23,6 @@ net.sf.saxon Saxon-HE - - junit - junit - diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index aa5a47981..676ffd1a0 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -38,11 +38,6 @@ commons-cli commons-cli - - junit - junit - test - ${project.groupId} webmagic-core diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 87de28eee..86b65daf9 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -23,10 +23,6 @@ com.github.detro phantomjsdriver - - junit - junit - From dba166830625fa69ae9817ec6409e22a24a83a03 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 23:23:32 +0800 Subject: [PATCH 22/37] Add tests to test the equals & hashCode of Proxy. 
--- .../codecraft/webmagic/proxy/ProxyTest.java | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 8e4c82026..cff25b0ec 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -8,18 +8,18 @@ import java.util.List; import org.apache.http.HttpHost; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; /** * @author yxssfxwzy@sina.com May 30, 2014 - * + * */ public class ProxyTest { private static List httpProxyList = new ArrayList(); - @BeforeClass + @BeforeAll public static void before() { // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", // "0.0.0.4:0" }; @@ -48,7 +48,7 @@ public void run() { } @Test - public void testCreate() { + void testCreate() { Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080")); assertNull(proxy.getScheme()); assertNull(proxy.getUsername()); @@ -86,7 +86,15 @@ public void testCreate() { } @Test - public void testToString() { + void testEqualsHashCode() { + var proxy0 = new Proxy("::1", 1080); + var proxy1 = new Proxy("::1", 1080); + assertEquals(proxy0, proxy1); + assertEquals(proxy0.hashCode(), proxy1.hashCode()); + } + + @Test + void testToString() { assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); From 5196a56ccf7aee374b44a02a1e9a414496431938 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 23:30:43 +0800 Subject: [PATCH 23/37] Format code. 
--- pom.xml | 7 +++- webmagic-core/pom.xml | 7 +++- webmagic-coverage/pom.xml | 6 ++- webmagic-extension/pom.xml | 7 +++- webmagic-samples/pom.xml | 7 +++- webmagic-saxon/pom.xml | 7 +++- webmagic-scripts/pom.xml | 7 +++- webmagic-selenium/pom.xml | 75 ++++++++++++++++++++------------------ 8 files changed, 80 insertions(+), 43 deletions(-) diff --git a/pom.xml b/pom.xml index f08b3b543..4ec241db7 100644 --- a/pom.xml +++ b/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 9f2eda76c..f6530b467 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index c17309c87..c53a30c28 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -1,7 +1,9 @@ - 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index b72922317..9290c18fc 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 41a4b7b45..3eff105e1 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 930f5b32c..b528d8ae6 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 676ffd1a0..86e36c7da 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 86b65daf9..831cfecf8 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -1,41 +1,46 @@ - - - us.codecraft - webmagic - 1.0.0-SNAPSHOT - - 4.0.0 + + + us.codecraft + webmagic + 1.0.0-SNAPSHOT + + 4.0.0 - 
webmagic-selenium + webmagic-selenium - - - org.seleniumhq.selenium - selenium-java - - - ${project.groupId} - webmagic-core - ${project.version} - - - com.github.detro - phantomjsdriver - - + + + org.seleniumhq.selenium + selenium-java + + + ${project.groupId} + webmagic-core + ${project.version} + + + com.github.detro + phantomjsdriver + + - - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0-M1 - - true - - - - + + + + org.apache.maven.plugins + maven-deploy-plugin + 3.0.0-M1 + + true + + + + From 9ab342c3a782db8ad95e8e3ce1cff2cb4d8b158d Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 23:31:21 +0800 Subject: [PATCH 24/37] Remove public modifiers from junit5 test methods. --- .../src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index cff25b0ec..61fc6ab8b 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -15,12 +15,12 @@ * @author yxssfxwzy@sina.com May 30, 2014 * */ -public class ProxyTest { +class ProxyTest { private static List httpProxyList = new ArrayList(); @BeforeAll - public static void before() { + static void before() { // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", // "0.0.0.4:0" }; String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; From 5344db0106b80568b1b4bee26af8f9dcce2f521f Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 23:35:26 +0800 Subject: [PATCH 25/37] Upgrade jacoco-maven-plugin from 0.8.8 to 0.8.12. 
--- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 4ec241db7..459930adc 100644 --- a/pom.xml +++ b/pom.xml @@ -468,7 +468,7 @@ org.jacoco jacoco-maven-plugin - 0.8.8 + 0.8.12 com.amashchenko.maven.plugin From e34b495625766b66de6ce954f3a38c9efc170027 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Mon, 22 Apr 2024 00:17:58 +0800 Subject: [PATCH 26/37] Upgrade maven-suirefire-plugin from 3.0.0-M7 to 3.2.5. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 459930adc..d629db370 100644 --- a/pom.xml +++ b/pom.xml @@ -453,7 +453,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.0.0-M7 + 3.2.5 org.apache.maven.plugins From a5144350bddea084e5e88bbac2a71642f224e0ff Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Mon, 22 Apr 2024 00:45:54 +0800 Subject: [PATCH 27/37] Upgrade maven plugins to latest versions. --- pom.xml | 55 +++++++++++++++++++++++++++--------------- webmagic-saxon/pom.xml | 17 +++---------- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/pom.xml b/pom.xml index d629db370..47f8c3546 100644 --- a/pom.xml +++ b/pom.xml @@ -275,7 +275,6 @@ org.apache.maven.plugins maven-enforcer-plugin - 3.1.0 enforce-maven @@ -285,7 +284,7 @@ - 3.5.0 + 3.6.3 @@ -331,7 +330,6 @@ org.apache.maven.plugins maven-source-plugin - 3.2.1 attach-sources @@ -344,9 +342,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.4.1 - UTF-8 WebMagic ${project.version} en_US @@ -373,7 +369,6 @@ org.apache.maven.plugins maven-release-plugin - 3.0.0-M6 org.jacoco @@ -408,47 +403,67 @@ org.apache.maven.plugins maven-clean-plugin - 3.2.0 + 3.3.2 org.apache.maven.plugins maven-compiler-plugin - 3.10.1 + 3.13.0 org.apache.maven.plugins maven-deploy-plugin - 3.0.0 + 3.1.1 + + + org.apache.maven.plugins + maven-enforcer-plugin + 3.4.1 org.apache.maven.plugins maven-install-plugin - 3.0.1 + 3.1.1 org.apache.maven.plugins maven-jar-plugin - 3.3.0 + 3.4.1 + + + org.apache.maven.plugins 
+ maven-javadoc-plugin + 3.6.3 org.apache.maven.plugins maven-jxr-plugin - 3.3.0 + 3.3.2 org.apache.maven.plugins maven-pmd-plugin - 3.19.0 + 3.21.2 + + + org.apache.maven.plugins + maven-release-plugin + 3.0.1 org.apache.maven.plugins maven-resources-plugin - 3.3.0 + 3.3.1 org.apache.maven.plugins maven-site-plugin - 4.0.0-M3 + 4.0.0-M13 + + + org.apache.maven.plugins + maven-source-plugin + 3.3.0 org.apache.maven.plugins @@ -458,7 +473,7 @@ org.apache.maven.plugins maven-surefire-report-plugin - 3.0.0-M7 + 3.2.5 org.codehaus.mojo @@ -473,12 +488,12 @@ com.amashchenko.maven.plugin gitflow-maven-plugin - 1.18.0 + 1.21.0 com.github.spotbugs spotbugs-maven-plugin - 4.7.2.0 + 4.8.4.0 @@ -525,7 +540,7 @@ org.apache.maven.plugins maven-source-plugin - 3.2.1 + 3.3.0 package @@ -553,7 +568,7 @@ org.apache.maven.plugins maven-gpg-plugin - 3.0.1 + 3.2.4 verify diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index b528d8ae6..2c5bc9597 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -14,6 +14,10 @@ webmagic-saxon + + true + + ${project.groupId} @@ -30,17 +34,4 @@ - - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0-M1 - - true - - - - - From 884f51ba3bf336cc79b1487ca4faef644fe4bd76 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 20:38:03 +0800 Subject: [PATCH 28/37] Update to hotfix version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 47f8c3546..a0b38ce15 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index f6530b467..2dad0a0a9 100644 --- 
a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index c53a30c28..3d42cd618 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 9290c18fc..76eeec0fe 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 3eff105e1..d5849aecf 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 2c5bc9597..5cb3b473c 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 86e36c7da..14d92f078 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 831cfecf8..22239c3ae 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 From a81c4e7627853623c8e1661f7fd1c7e47e8321bf Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 21:46:35 +0800 Subject: [PATCH 29/37] Update to hotfix version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff 
--git a/pom.xml b/pom.xml index a0b38ce15..e4d5607c4 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.10.1 + 0.10.2 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 2dad0a0a9..13afbf7e1 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 3d42cd618..d928636d3 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 76eeec0fe..18ce75e8b 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 035ec7226..00d810c99 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 5cb3b473c..743327fc5 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 14d92f078..d69164b54 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 22239c3ae..d40ebd2db 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 From 38f240c42e341da0a11ce2c04f35cba7f654e142 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 23:39:23 +0800 Subject: [PATCH 30/37] Update to hotfix version --- pom.xml | 2 +- 
webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index e4d5607c4..1b2aabb17 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.10.2 + 0.10.3 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 13afbf7e1..3c9ca0078 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index d928636d3..0c09d4047 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 18ce75e8b..bcf473be1 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 00d810c99..eb2ed69cd 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 743327fc5..fd993e09c 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index d69164b54..e31d57218 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index d40ebd2db..3f8aa6951 100644 --- a/webmagic-selenium/pom.xml +++ 
b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 From 16a4fe3e28af963a9ce61bda14d2497bf914191e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 17 May 2024 13:17:13 +0800 Subject: [PATCH 31/37] Use oxerr-parent instead. --- pom.xml | 283 +----------------- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- .../ConfigurablePageProcessorTest.java | 1 - .../model/ModelPageProcessorTest.java | 1 - webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 10 files changed, 14 insertions(+), 285 deletions(-) diff --git a/pom.xml b/pom.xml index 1b2aabb17..eee06779e 100644 --- a/pom.xml +++ b/pom.xml @@ -5,9 +5,14 @@ xsi:schemaLocation=" http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - us.codecraft - 0.10.3 4.0.0 + + org.oxerr + oxerr-parent + 2.1.0 + + us.codecraft + 1.0.0-SNAPSHOT pom UTF-8 @@ -272,73 +277,6 @@ - - org.apache.maven.plugins - maven-enforcer-plugin - - - enforce-maven - - enforce - - - - - 3.6.3 - - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - - - - - - - - - - - - - - - - - - - - - - org.apache.maven.plugins - maven-resources-plugin - - - org.apache.maven.plugins - maven-jar-plugin - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - org.apache.maven.plugins maven-javadoc-plugin @@ -366,10 +304,6 @@ - - org.apache.maven.plugins - maven-release-plugin - org.jacoco jacoco-maven-plugin @@ -398,209 +332,6 @@ - - - - org.apache.maven.plugins - maven-clean-plugin - 3.3.2 - - - org.apache.maven.plugins - maven-compiler-plugin - 3.13.0 - - - org.apache.maven.plugins - maven-deploy-plugin - 3.1.1 - - - org.apache.maven.plugins - maven-enforcer-plugin - 3.4.1 - - - org.apache.maven.plugins - maven-install-plugin - 3.1.1 - - - org.apache.maven.plugins - 
maven-jar-plugin - 3.4.1 - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.6.3 - - - org.apache.maven.plugins - maven-jxr-plugin - 3.3.2 - - - org.apache.maven.plugins - maven-pmd-plugin - 3.21.2 - - - org.apache.maven.plugins - maven-release-plugin - 3.0.1 - - - org.apache.maven.plugins - maven-resources-plugin - 3.3.1 - - - org.apache.maven.plugins - maven-site-plugin - 4.0.0-M13 - - - org.apache.maven.plugins - maven-source-plugin - 3.3.0 - - - org.apache.maven.plugins - maven-surefire-plugin - 3.2.5 - - - org.apache.maven.plugins - maven-surefire-report-plugin - 3.2.5 - - - org.codehaus.mojo - taglist-maven-plugin - 3.0.0 - - - org.jacoco - jacoco-maven-plugin - 0.8.12 - - - com.amashchenko.maven.plugin - gitflow-maven-plugin - 1.21.0 - - - com.github.spotbugs - spotbugs-maven-plugin - 4.8.4.0 - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - none - - - - org.apache.maven.plugins - maven-jxr-plugin - - - org.apache.maven.plugins - maven-pmd-plugin - - - org.apache.maven.plugins - maven-surefire-report-plugin - - - org.codehaus.mojo - taglist-maven-plugin - - - com.github.spotbugs - spotbugs-maven-plugin - - - - - - - release - - - - - org.apache.maven.plugins - maven-source-plugin - 3.3.0 - - - package - - jar-no-fork - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.4.1 - - - package - - jar - - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 3.2.4 - - - verify - - sign - - - - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.13 - true - - sonatype-nexus-staging - https://oss.sonatype.org/ - true - - - - - - - sonatype-nexus-snapshots - https://oss.sonatype.org/content/repositories/snapshots/ - - - sonatype-nexus-staging - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - - - diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 3c9ca0078..f6530b467 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 
4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 0c09d4047..c53a30c28 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index bcf473be1..9290c18fc 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java index 63c40d295..c2081dbf3 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java @@ -13,7 +13,6 @@ /** * @author code4crafter@gmail.com - * @date 14-4-5 */ public class ConfigurablePageProcessorTest { diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java index 627fa6e84..1014a45f5 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java @@ -12,7 +12,6 @@ /** * @author code4crafter@gmail.com - * @date 14-4-4 */ public class ModelPageProcessorTest { diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index eb2ed69cd..f1da70165 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index fd993e09c..2c5bc9597 100644 --- a/webmagic-saxon/pom.xml +++ 
b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index e31d57218..86e36c7da 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 3f8aa6951..831cfecf8 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 From 7d2d2244b3f5c830f1e9258f28ab669e3596eaa2 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 21 May 2024 12:55:05 +0800 Subject: [PATCH 32/37] Upgrade oxerr-parent from 2.1.0 to 2.2.1. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index eee06779e..333cf41d6 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.oxerr oxerr-parent - 2.1.0 + 2.2.1 us.codecraft 1.0.0-SNAPSHOT From d2aebc60a7cb72fbd8107c844983e24543e106e4 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 4 Jun 2024 00:57:28 +0800 Subject: [PATCH 33/37] Make getCharset to support null parameter. 
--- .../src/main/java/us/codecraft/webmagic/utils/UrlUtils.java | 4 ++++ .../test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index c61483a39..ea317c405 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -116,6 +116,10 @@ public static List convertToUrls(Collection requests) { private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE); public static String getCharset(String contentType) { + if (contentType == null) { + return null; + } + Matcher matcher = patternForCharset.matcher(contentType); if (matcher.find()) { String charset = matcher.group(1); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 6afdeefe4..38c8295bb 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.utils; +import static org.junit.Assert.assertNull; + import org.junit.Assert; import org.junit.Test; @@ -43,5 +45,9 @@ public void testGetDomain(){ Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url)); } + @Test + public void testGetCharset() { + assertNull(UrlUtils.getCharset(null)); + } } From 5c43e361188fb23f36b1edce9845e10f9386c993 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 4 Jun 2024 00:59:30 +0800 Subject: [PATCH 34/37] Make sure the contentType of detectCharset could be null. 
--- .../webmagic/utils/CharsetUtilsTest.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java new file mode 100644 index 000000000..987a6f77a --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.utils; + +import static org.junit.jupiter.api.Assertions.assertNull; + +import java.io.IOException; + +import org.junit.jupiter.api.Test; + +class CharsetUtilsTest { + + @Test + void testDetectCharset() throws IOException { + assertNull(CharsetUtils.detectCharset(null, new byte[0])); + } + +} From 49a5efff46ec604578d6cb98015a8700bdf1fa21 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 4 Jun 2024 01:02:45 +0800 Subject: [PATCH 35/37] Add a private constructor to hide the implicit public one. 
--- .../main/java/us/codecraft/webmagic/utils/CharsetUtils.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index ccf00a466..63bb4c110 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -21,6 +21,10 @@ public abstract class CharsetUtils { private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class); + private CharsetUtils() { + throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!"); + } + public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { String charset; // charset From 4d0cdb011fc42251c2476bb5f44379d588ae65f5 Mon Sep 17 00:00:00 2001 From: Niu_XZ Date: Mon, 17 Jun 2024 17:27:28 +0800 Subject: [PATCH 36/37] =?UTF-8?q?stopWhenComplete=EF=BC=8C=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E5=8A=A8=E6=80=81=E4=BF=AE=E6=94=B9=E5=AE=8C=E6=88=90?= =?UTF-8?q?=E6=97=B6=E5=81=9C=E6=AD=A2=E6=96=B9=E6=B3=95=E3=80=82=20(#1169?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: niuxiaozu --- .../src/main/java/us/codecraft/webmagic/Spider.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 11a671f7a..a35af70af 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -85,7 +85,7 @@ public class Spider implements Runnable, Task { protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - protected boolean exitWhenComplete = true; + protected volatile boolean exitWhenComplete = true; protected final static int 
STAT_INIT = 0; @@ -598,6 +598,13 @@ public void stop() { } } + /** + * Stop when all tasks in the queue are completed and all worker threads are also completed + */ + public void stopWhenComplete(){ + this.exitWhenComplete = true; + } + /** * start with more than one threads * From 3e9cd9b5c35a6acf05868cca78caf68f1aec6a40 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Fri, 5 Jul 2024 00:20:28 +0800 Subject: [PATCH 37/37] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 333cf41d6..b96c9a829 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.0-SNAPSHOT + 1.0.0 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index f6530b467..6e1d3c896 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index c53a30c28..19cdc33d7 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 9290c18fc..15f94cf5e 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index f1da70165..921161362 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 2c5bc9597..2530bd81d 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ 
-8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 86e36c7da..3c03aaf8e 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 831cfecf8..a0dc13861 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0