diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..e8ebd6f
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,29 @@
+name: cc-webgraph build  # runs on pushes and pull requests targeting main
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        java: [ 11, 17, 21 ]  # one job per JDK; 11 is the minimum stated in the README
+    name: Java ${{ matrix.java }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup JDK
+        uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin'
+          java-version: ${{ matrix.java }}
+          cache: 'maven'  # let setup-java cache Maven dependencies between runs
+
+      - name: Build
+        run: mvn verify javadoc:aggregate  # build and test, then generate the aggregated Javadoc
diff --git a/README.md b/README.md
index 91b004a..7b42ede 100644
--- a/README.md
+++ b/README.md
@@ -4,11 +4,14 @@ Tools to construct and process web graphs from Common Crawl data
## Compiling and Packaging Java Tools
+Java 11 or later is required.
+
The Java tools are compiled and packaged by [Maven](https://maven.apache.org/). If Maven is installed just run `mvn package`. Now the Java tools can be run via
```
java -cp target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar ...
```
-The assembly jar file requires Java 10 or upwards to run. It includes also the [WebGraph](https://webgraph.di.unimi.it/) and [LAW](https://law.di.unimi.it/software.php) packages required to compute [PageRank](https://en.wikipedia.org/wiki/PageRank) and [Harmonic Centrality](https://en.wikipedia.org/wiki/Centrality#Harmonic_centrality).
+
+The assembly jar file also includes the [WebGraph](https://webgraph.di.unimi.it/) and [LAW](https://law.di.unimi.it/software.php) packages required to compute [PageRank](https://en.wikipedia.org/wiki/PageRank) and [Harmonic Centrality](https://en.wikipedia.org/wiki/Centrality#Harmonic_centrality).
Note that the webgraphs are usually multiple Gigabytes in size and require a sufficient Java heap size ([Java option](https://docs.oracle.com/en/java/javase/14/docs/specs/man/java.html#extra-options-for-java) `-Xmx`) for processing.
diff --git a/pom.xml b/pom.xml
index d0aa622..491e5e9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -12,17 +12,17 @@
UTF-8
- 1.8
+ 113.6.103.7.02.7.2
- 8.5.12
+ 8.5.131.4
- 2.0.7
+ 2.0.12
- 5.10.0
+ 5.10.2
@@ -34,7 +34,7 @@
maven-compiler-plugin
- 3.8.0
+ 3.13.0${java.version}
@@ -42,7 +42,7 @@
maven-assembly-plugin
- 3.1.1
+ 3.7.1jar-with-dependencies
@@ -60,7 +60,7 @@
maven-surefire-plugin
- 3.0.0-M6
+ 3.2.5
@@ -145,10 +145,30 @@
law${law.version}
+
+ net.sf.jung
+ jung-api
+
+
+ net.sf.jung
+ jung-io
+
+
+ org.apache.httpcomponents
+ httpclient
+
+
+ org.apache.httpcomponents
+ httpasyncclient
+ org.eclipse.jetty.aggregatejetty-all
+
+ org.softee
+ pojo-mbean
+ com.fasterxml.jacksonjackson-bom
diff --git a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java
index 25dd102..a47b4be 100644
--- a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java
+++ b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java
@@ -31,9 +31,9 @@
* represented by two text files/streams with tab-separated columns
*
*
vertices
- *
⟨id, revName⟩
+ *
<id, revName>
*
edges
- *
⟨fromId, toId⟩
+ *
<fromId, toId>
*
* Host or domain names are reversed (www.example.com is written as
* com.example.www). The vertices file is sorted lexicographically
@@ -47,19 +47,20 @@
*
*
* Notes, assumptions and preconditions:
+ *
*
- *
host nodes must be sorted lexicographically by reversed host name, see
+ *
host vertices must be sorted lexicographically by reversed host name, see
* above
*
the host-domain map is hold as array. To overcome Java's max array size
* (approx. 2^32 or {@link Arrays#MAX_ARRAY_SIZE}) {@link HostToDomainGraphBig}
- * (based on fastutils' {@link BigArrays}) is automatically used if the array
- * size limit is hit.
+ * (based on fastutils' {@link BigArrays}) is used if the array size limit is
+ * hit by the number of hosts. This number (or an estimate) needs to be known
+ * in advance.
*
the number of resulting domains is limited by Java's max. array size.
* This shouldn't be a problem.
*
also the number of hosts per domain is limited by Java's max. array
- * size
+ * size.
*
- *
*/
public class HostToDomainGraph {
@@ -160,6 +161,11 @@ public int compareTo(Domain o) {
/**
* Whether the domain is safe to output given the reversed domain name seen
* next.
+ *
+ * @param nextDomainRevName next name in lexicographically sorted list of
+ * reversed domain names
+ * @return true if the domain is safe to output, that is from a list of sorted
+ * host names no host later in this list may fold to this domain name
*/
public boolean isSafeToOutput(String nextDomainRevName) {
return isSafeToOutput(this.revName, nextDomainRevName);
@@ -273,7 +279,7 @@ public void multiPartSuffixesAsDomains(boolean include) {
/**
* Reverse host name, eg. www.example.com is reversed to
- * com.example.www. Can be also used to "unreverse" a reversed host
+ * com.example.www. Can also be used to "unreverse" a reversed host
* name.
*
* @param host name
@@ -462,10 +468,12 @@ public HostToDomainGraphBig(long maxSize) {
ids = LongBigArrays.newBigArray(maxSize);
}
+ @Override
protected void setValue(long id, long value) {
BigArrays.set(ids, id, value);
}
+ @Override
protected long getValue(long id) {
return BigArrays.get(ids, id);
}
diff --git a/src/script/host2domaingraph.sh b/src/script/host2domaingraph.sh
index 791abc5..3bfbc11 100755
--- a/src/script/host2domaingraph.sh
+++ b/src/script/host2domaingraph.sh
@@ -28,8 +28,8 @@ if [ $# -lt 3 ]; then
if [ ${#FLAGS[@]} -gt 0 ]; then
echo ""
echo "Calling HostToDomainGraph with provided flags (${FLAGS[*]}):"
- "$JAVA_HOME"/bin/java -cp "$CLASSPATH":"JAR" \
- "${PROPERTIES[@]}" org.commoncrawl.webgraph.HostToDomainGraph "${FLAGS[@]}"
+ "$JAVA_HOME"/bin/java -cp "$CLASSPATH":"$JAR" "${PROPERTIES[@]}" \
+ org.commoncrawl.webgraph.HostToDomainGraph "${FLAGS[@]}"
fi
exit 1
fi