diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..e8ebd6f --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,29 @@ +name: cc-webgraph build + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + java: [ 11, 17, 21 ] + name: Java ${{ matrix.java }} + steps: + - uses: actions/checkout@v4 + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: ${{ matrix.java }} + cache: 'maven' + + - name: Build + run: mvn verify javadoc:aggregate diff --git a/README.md b/README.md index 91b004a..7b42ede 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,14 @@ Tools to construct and process web graphs from Common Crawl data ## Compiling and Packaging Java Tools +Java 11 or later is required. + The Java tools are compiled and packaged by [Maven](https://maven.apache.org/). If Maven is installed just run `mvn package`. Now the Java tools can be run via ``` java -cp target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar ... ``` -The assembly jar file requires Java 10 or upwards to run. It includes also the [WebGraph](https://webgraph.di.unimi.it/) and [LAW](https://law.di.unimi.it/software.php) packages required to compute [PageRank](https://en.wikipedia.org/wiki/PageRank) and [Harmonic Centrality](https://en.wikipedia.org/wiki/Centrality#Harmonic_centrality). + +The assembly jar file also includes the [WebGraph](https://webgraph.di.unimi.it/) and [LAW](https://law.di.unimi.it/software.php) packages required to compute [PageRank](https://en.wikipedia.org/wiki/PageRank) and [Harmonic Centrality](https://en.wikipedia.org/wiki/Centrality#Harmonic_centrality). Note that the webgraphs are usually multiple Gigabytes in size and require a sufficient Java heap size ([Java option](https://docs.oracle.com/en/java/javase/14/docs/specs/man/java.html#extra-options-for-java) `-Xmx`) for processing. 
diff --git a/pom.xml b/pom.xml index d0aa622..491e5e9 100644 --- a/pom.xml +++ b/pom.xml @@ -12,17 +12,17 @@ UTF-8 - 1.8 + 11 3.6.10 3.7.0 2.7.2 - 8.5.12 + 8.5.13 1.4 - 2.0.7 + 2.0.12 - 5.10.0 + 5.10.2 @@ -34,7 +34,7 @@ maven-compiler-plugin - 3.8.0 + 3.13.0 ${java.version} ${java.version} @@ -42,7 +42,7 @@ maven-assembly-plugin - 3.1.1 + 3.7.1 jar-with-dependencies @@ -60,7 +60,7 @@ maven-surefire-plugin - 3.0.0-M6 + 3.2.5 @@ -145,10 +145,30 @@ law ${law.version} + + net.sf.jung + jung-api + + + net.sf.jung + jung-io + + + org.apache.httpcomponents + httpclient + + + org.apache.httpcomponents + httpasyncclient + org.eclipse.jetty.aggregate jetty-all + + org.softee + pojo-mbean + com.fasterxml.jackson jackson-bom diff --git a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java index 25dd102..a47b4be 100644 --- a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java +++ b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java @@ -31,9 +31,9 @@ * represented by two text files/streams with tab-separated columns *
*
vertices
- *
⟨id, revName⟩
+ *
<id, revName>
*
edges
- *
⟨fromId, toId⟩
+ *
<fromId, toId>
*
* Host or domain names are reversed (www.example.com is written as * com.example.www). The vertices file is sorted lexicographically @@ -47,19 +47,20 @@ * *

* Notes, assumptions and preconditions: + *

*
    - *
  • host nodes must be sorted lexicographically by reversed host name, see + *
  • host vertices must be sorted lexicographically by reversed host name, see * above
  • *
  • the host-domain map is hold as array. To overcome Java's max array size * (approx. 2^32 or {@link Arrays#MAX_ARRAY_SIZE}) {@link HostToDomainGraphBig} - * (based on fastutils' {@link BigArrays}) is automatically used if the array - * size limit is hit.
  • + * (based on fastutil's {@link BigArrays}) is used if the array size limit is + * hit by the number of hosts. This number (or an estimate) needs to be known + * in advance. *
  • the number of resulting domains is limited by Java's max. array size. * This shouldn't be a problem.
  • *
  • also the number of hosts per domain is limited by Java's max. array - * size
  • + * size. *
- *

*/ public class HostToDomainGraph { @@ -160,6 +161,11 @@ public int compareTo(Domain o) { /** * Whether the domain is safe to output given the reversed domain name seen * next. + * + * @param nextDomainRevName next name in lexicographically sorted list of + * reversed domain names + * @return true if the domain is safe to output, that is from a list of sorted + * host names no host later in this list may fold to this domain name */ public boolean isSafeToOutput(String nextDomainRevName) { return isSafeToOutput(this.revName, nextDomainRevName); @@ -273,7 +279,7 @@ public void multiPartSuffixesAsDomains(boolean include) { /** * Reverse host name, eg. www.example.com is reversed to - * com.example.www. Can be also used to "unreverse" a reversed host + * com.example.www. Can also be used to "unreverse" a reversed host * name. * * @param host name @@ -462,10 +468,12 @@ public HostToDomainGraphBig(long maxSize) { ids = LongBigArrays.newBigArray(maxSize); } + @Override protected void setValue(long id, long value) { BigArrays.set(ids, id, value); } + @Override protected long getValue(long id) { return BigArrays.get(ids, id); } diff --git a/src/script/host2domaingraph.sh b/src/script/host2domaingraph.sh index 791abc5..3bfbc11 100755 --- a/src/script/host2domaingraph.sh +++ b/src/script/host2domaingraph.sh @@ -28,8 +28,8 @@ if [ $# -lt 3 ]; then if [ ${#FLAGS[@]} -gt 0 ]; then echo "" echo "Calling HostToDomainGraph with provided flags (${FLAGS[*]}):" - "$JAVA_HOME"/bin/java -cp "$CLASSPATH":"JAR" \ - "${PROPERTIES[@]}" org.commoncrawl.webgraph.HostToDomainGraph "${FLAGS[@]}" + "$JAVA_HOME"/bin/java -cp "$CLASSPATH":"$JAR" "${PROPERTIES[@]}" \ + org.commoncrawl.webgraph.HostToDomainGraph "${FLAGS[@]}" fi exit 1 fi