From 7a48c844645893f2af92f7ab51c4dc4fc0cd6a06 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 12 Jun 2019 13:52:11 +0200 Subject: [PATCH 1/4] #1384 - Update datasets API - Add SHA512 hash support - Update dataset descriptions - Add option to calculate hash sum of plain text files using normalized whitespace - Added option to set a default verification policy via a system property - Fixed a few bad dataset descriptions - Updated documentation regarding the new verificationMode option and SHA512 - Improved log messages --- .../api/datasets/ArtifactDescription.java | 15 ++ .../core/api/datasets/DatasetFactory.java | 167 ++++++++++++------ .../api/datasets/DatasetValidationPolicy.java | 2 +- .../core/api/datasets/VerificationMode.java | 31 ++++ .../internal/ArtifactDescriptionImpl.java | 40 +++++ .../datasets/internal/actions/Explode.java | 6 +- .../lib/alpino-conll-nl-20100114.yaml | 6 +- .../core/api/datasets/lib/aqmar-ar-1.0.yaml | 5 +- .../api/datasets/lib/brown-en-teixml.yaml | 1 + .../core/api/datasets/lib/cdt-conll-da-1.yaml | 5 +- .../core/api/datasets/lib/conll2000-en.yaml | 2 + .../core/api/datasets/lib/conll2002-es.yaml | 3 +- .../core/api/datasets/lib/conll2002-nl.yaml | 1 + .../core/api/datasets/lib/conll2006-pt.yaml | 15 +- .../core/api/datasets/lib/conll2009-ca.yaml | 1 + .../core/api/datasets/lib/conll2009-de.yaml | 2 +- .../core/api/datasets/lib/conll2009-es.yaml | 1 + .../core/api/datasets/lib/conll2009-ja.yaml | 1 + .../datasets/lib/coptictb-conll-cop-1.0.yaml | 5 +- .../core/api/datasets/lib/finntb-fi-3.1.yaml | 3 + .../api/datasets/lib/germeval2014-de.yaml | 7 +- .../datasets/lib/glove.6B-en-20151025.yaml | 13 +- .../lib/gum-dep-stanford-en-4.1.0.yaml | 1 + .../api/datasets/lib/gum-en-conll-2.2.0.yaml | 1 + .../api/datasets/lib/gum-en-conll-2.3.2.yaml | 1 + .../api/datasets/lib/gum-en-conll-3.0.0.yaml | 1 + .../api/datasets/lib/hdt-de-conll-1.0.1.yaml | 5 +- .../core/api/datasets/lib/iulatb-es-1.yaml | 5 +- .../datasets/lib/jos100k-conll-sl-2.0.yaml | 3 + .../datasets/lib/masc-conll-en-20080522.yaml | 1 + .../core/api/datasets/lib/ndt-nb-1.01.yaml | 5 +- .../core/api/datasets/lib/ndt-nn-1.01.yaml | 5 +- .../core/api/datasets/lib/nemgp-de-0.1.yaml | 7 +- .../core/api/datasets/lib/perseus-el-2.1.yaml | 3 + .../core/api/datasets/lib/perseus-la-2.1.yaml | 3 + .../core/api/datasets/lib/poldb-pl-0.5.yaml | 5 +- .../core/api/datasets/lib/poltb-pl-0.5.yaml | 5 +- .../api/datasets/lib/sdt-conll-sl-0.1.yaml | 1 + .../api/datasets/lib/sdt-conll-sl-0.4.yaml | 5 +- .../lib/sequoia-surf-conll-fr-7.0.yaml | 1 + .../core/api/datasets/lib/sethr-hr-1.yaml | 3 + .../datasets/lib/sethrplus-hr-20160613.yaml | 9 +- ...nford-egw4-reut-512-clusters-20130608.yaml | 1 + .../datasets/lib/talkbanken05-dep-sv-1.1.yaml | 3 +- .../datasets/lib/talkbanken05-dps-sv-1.1.yaml | 3 +- .../datasets/lib/talkbanken05-fps-sv-1.1.yaml | 3 +- .../lib/tedtreebank-conll-en-1.0.yaml | 5 +- .../datasets/lib/tut-conll-it-20101122.yaml | 5 + .../api/datasets/lib/ud-en-conllu-1.4.yaml | 1 + .../core/api/datasets/lib/updt-fa-1.3.yaml | 11 +- .../core/api/datasets/lib/wasr-de-1.00.yaml | 7 +- .../core/api/datasets/lib/wasr-l-en-1.00.yaml | 7 +- .../api/datasets/lib/wasr-xl-en-1.00.yaml | 13 +- .../core/api/datasets/DatasetFactoryTest.java | 19 +- .../main/asciidoc/user-guide/datasets.adoc | 7 +- 55 files changed, 371 insertions(+), 116 deletions(-) create mode 100644 dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/VerificationMode.java diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ArtifactDescription.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ArtifactDescription.java index b78d71b3be..075e53fd5a 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ArtifactDescription.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ArtifactDescription.java @@ -21,6 +21,11 @@ public interface ArtifactDescription { + /** + * @return the dataset to which this artifact belongs. + */ + DatasetDescription getDataset(); + /** * @return artifact name/ID */ @@ -43,6 +48,16 @@ public interface ArtifactDescription * @return SHA1 hash of the artifact. */ String getSha1(); + + /** + * @return SHA512 hash of the artifact. + */ + String getSha512(); + + /** + * @return the verification mode. + */ + VerificationMode getVerificationMode(); /** * Whether this artifact is shared between multiple datasets. If this flag is enabled, the diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java index bcd920f52a..494c9bddf9 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java @@ -17,6 +17,7 @@ */ package org.dkpro.core.api.datasets; +import static java.nio.charset.StandardCharsets.UTF_8; import static java.util.Collections.unmodifiableList; import java.io.File; @@ -47,6 +48,7 @@ import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.NullOutputStream; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dkpro.core.api.datasets.internal.ActionDescriptionImpl; @@ -64,9 +66,15 @@ public class DatasetFactory { + public static final String PROP_DATASET_VERIFICATION_POLICY = "dkpro.dataset.verification.policy"; + + private static final DatasetValidationPolicy defaultVerificationPolicy = DatasetValidationPolicy + .valueOf(System.getProperty(PROP_DATASET_VERIFICATION_POLICY, + DatasetValidationPolicy.STRICT.name())); + private Map datasets; - private Map> actionRegistry; + private final Map> actionRegistry; private final Log LOG = LogFactory.getLog(getClass()); @@ -112,7 +120,7 @@ public DatasetDescription getDescription(String aId) public Dataset load(String aId) throws IOException { - return load(aId, DatasetValidationPolicy.STRICT); + return load(aId, defaultVerificationPolicy); } public Dataset load(String aId, DatasetValidationPolicy aPolicy) @@ -197,6 +205,7 @@ private Map loadFromYaml() // Inject artifact names into artifacts for (Entry e : ds.getArtifacts().entrySet()) { ((ArtifactDescriptionImpl) e.getValue()).setName(e.getKey()); + ((ArtifactDescriptionImpl) e.getValue()).setDataset(ds); } sets.put(ds.getId(), ds); @@ -244,17 +253,12 @@ private void materialize(DatasetDescription aDataset, DatasetValidationPolicy aP continue; } - if (artifact.getSha1() != null) { - String actual = getDigest(cachedFile, "SHA1"); - if (!artifact.getSha1().equals(actual)) { - LOG.info("Local SHA1 hash mismatch on [" + cachedFile + "] - expected [" - + artifact.getSha1() + "] - actual [" + actual + "]"); + if (artifact.getUrl() != null) { + boolean verificationOk = checkDigest(cachedFile, artifact); + if (!verificationOk) { reload = true; break packageValidationLoop; } - else { - LOG.info("Local SHA1 hash verified on [" + cachedFile + "] - [" + actual + "]"); - } } } @@ -272,30 +276,32 @@ private void materialize(DatasetDescription aDataset, DatasetValidationPolicy aP for (ArtifactDescription artifact : artifacts) { Path cachedFile = resolve(aDataset, artifact); - if (Files.exists(cachedFile)) { - continue; - } - if (artifact.getText() != null) { + // Check if file on disk corresponds to text stored in artifact description + if (Files.exists(cachedFile)) { + String text = FileUtils.readFileToString(cachedFile.toFile(), UTF_8); + text = StringUtils.normalizeSpace(text); + if (StringUtils.normalizeSpace(artifact.getText()).equals(text)) { + continue; + } + } + Files.createDirectories(cachedFile.getParent()); LOG.info("Creating [" + cachedFile + "]"); try (Writer out = Files.newBufferedWriter(cachedFile, StandardCharsets.UTF_8)) { out.write(artifact.getText()); } + continue; } if (artifact.getUrl() != null) { + if (Files.exists(cachedFile)) { + continue; + } + Files.createDirectories(cachedFile.getParent()); - MessageDigest sha1; - try { - sha1 = MessageDigest.getInstance("SHA1"); - } - catch (NoSuchAlgorithmException e) { - throw new IOException(e); - } - URL source = new URL(artifact.getUrl()); LOG.info("Fetching [" + cachedFile + "]"); @@ -304,29 +310,25 @@ private void materialize(DatasetDescription aDataset, DatasetValidationPolicy aP connection.setRequestProperty("User-Agent", "Java"); try (InputStream is = connection.getInputStream()) { - DigestInputStream sha1Filter = new DigestInputStream(is, sha1); - Files.copy(sha1Filter, cachedFile); + Files.copy(is, cachedFile); + } - if (artifact.getSha1() != null) { - String sha1Hex = new String( - Hex.encodeHex(sha1Filter.getMessageDigest().digest())); - if (!artifact.getSha1().equals(sha1Hex)) { - String message = "SHA1 mismatch. Expected [" + artifact.getSha1() - + "] but got [" + sha1Hex + "]."; - switch (aPolicy) { - case STRICT: - LOG.error(message + " STRICT policy in effect. Bailing out."); - throw new IOException(message); - case CONTINUE: - LOG.warn(message + " CONTINUE policy in effect. Ignoring mismatch."); - break; - case DESPERATE: - LOG.warn(message + " DESPERATE policy in effect. Ignoring mismatch."); - break; - default: - throw new IllegalArgumentException("Unknown policy: " + aPolicy); - } - } + boolean verificationOk = checkDigest(cachedFile, artifact); + if (!verificationOk) { + switch (aPolicy) { + case STRICT: + throw new IOException("Checksum verification failed on [" + cachedFile + + "] STRICT policy in effect. Bailing out."); + case CONTINUE: + LOG.warn("Checksum verification failed on [" + cachedFile + + "] CONTINUE policy in effect. Ignoring mismatch."); + break; + case DESPERATE: + LOG.warn("Checksum verification failed on [" + cachedFile + + "] DESPERATE policy in effect. Ignoring mismatch."); + break; + default: + throw new IllegalArgumentException("Unknown policy: " + aPolicy); } } } @@ -355,10 +357,7 @@ private void materialize(DatasetDescription aDataset, DatasetValidationPolicy aP impl.apply(action, aDataset, artifact, cachedFile); } } - catch (IllegalStateException e) { - throw e; - } - catch (IOException e) { + catch (IllegalStateException | IOException e) { throw e; } catch (Exception e) { @@ -369,20 +368,78 @@ private void materialize(DatasetDescription aDataset, DatasetValidationPolicy aP Files.createFile(postActionCompleteMarker); } } + + private InputStream getDigestInputStream(Path aFile, ArtifactDescription aArtifact) + throws IOException + { + switch (aArtifact.getVerificationMode()) { + case BINARY: + return Files.newInputStream(aFile); + case TEXT: + String text = FileUtils.readFileToString(aFile.toFile(), UTF_8); + text = StringUtils.normalizeSpace(text); + return IOUtils.toInputStream(text, UTF_8); + default: + throw new IllegalArgumentException( + "Unknown verification mode [" + aArtifact.getVerificationMode() + "]"); + } + } - private String getDigest(Path aFile, String aDigest) throws IOException + private boolean checkDigest(Path aFile, ArtifactDescription aArtifact) throws IOException { - MessageDigest digest; + MessageDigest sha1; + MessageDigest sha512; try { - digest = MessageDigest.getInstance(aDigest); + sha1 = MessageDigest.getInstance("SHA-1"); + sha512 = MessageDigest.getInstance("SHA-512"); } catch (NoSuchAlgorithmException e) { throw new IOException(e); } - try (InputStream is = Files.newInputStream(aFile)) { - DigestInputStream digestFilter = new DigestInputStream(is, digest); - IOUtils.copy(digestFilter, new NullOutputStream()); - return new String(Hex.encodeHex(digestFilter.getMessageDigest().digest())); + + try (InputStream is = getDigestInputStream(aFile, aArtifact)) { + DigestInputStream sha1Filter = new DigestInputStream(is, sha1); + DigestInputStream sha512Filter = new DigestInputStream(sha1Filter, sha512); + IOUtils.copy(sha512Filter, new NullOutputStream()); + String sha1Hash = new String(Hex.encodeHex(sha1Filter.getMessageDigest().digest())); + String sha512Hash = new String(Hex.encodeHex(sha512Filter.getMessageDigest().digest())); + + if (aArtifact.getSha1() != null) { + if (!sha1Hash.equals(aArtifact.getSha1())) { + LOG.info("Local SHA1 hash mismatch for artifact [" + aArtifact.getName() + + "] in dataset [" + aArtifact.getDataset().getId() + "] - expected [" + + aArtifact.getSha1() + "] - actual [" + sha1Hash + "] (mode: " + + aArtifact.getVerificationMode() + ")"); + return false; + } + else if (aArtifact.getSha512() == null) { + LOG.info("Local SHA1 hash verified for artifact [" + aArtifact.getName() + + "] in dataset [" + aArtifact.getDataset().getId() + "] (mode: " + + aArtifact.getVerificationMode() + ")"); + } + } + + if (aArtifact.getSha512() != null) { + if (!sha512Hash.equals(aArtifact.getSha512())) { + LOG.info("Local SHA512 hash mismatch for artifact [" + aArtifact.getName() + + "] in dataset [" + aArtifact.getDataset().getId() + "] - expected [" + + aArtifact.getSha512() + "] - actual [" + sha512Hash + "] (mode: " + + aArtifact.getVerificationMode() + ")"); + return false; + } + else { + LOG.info("Local SHA512 hash verified for artifact [" + aArtifact.getName() + + "] in dataset [" + aArtifact.getDataset().getId() + "] (mode: " + + aArtifact.getVerificationMode() + ")"); + } + } + else { + LOG.info("No SHA512 hash for artifact [" + aArtifact.getName() + "] in dataset [" + + aArtifact.getDataset().getId() + "] - it is recommended to add it: [" + + sha512Hash + "]"); + } + + return true; } } } diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetValidationPolicy.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetValidationPolicy.java index a11691e694..a0b9e9fb66 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetValidationPolicy.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetValidationPolicy.java @@ -27,7 +27,7 @@ public enum DatasetValidationPolicy /** * If the local hash does not match if there is no local data, download it. If the freshly - * downloaded data does not match, contine. + * downloaded data does not match, continue. */ CONTINUE, diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/VerificationMode.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/VerificationMode.java new file mode 100644 index 0000000000..6ecf72ef97 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/VerificationMode.java @@ -0,0 +1,31 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.datasets; + +public enum VerificationMode +{ + /** + * Calculate the hash based on the binary content of the file. + */ + BINARY, + + /** + * Normalize whitespace before calculating the hash. + */ + TEXT; +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java index f47a215185..5825fe2ae0 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java @@ -17,21 +17,39 @@ */ package org.dkpro.core.api.datasets.internal; +import static org.dkpro.core.api.datasets.VerificationMode.BINARY; + import java.util.List; import org.dkpro.core.api.datasets.ActionDescription; import org.dkpro.core.api.datasets.ArtifactDescription; +import org.dkpro.core.api.datasets.DatasetDescription; +import org.dkpro.core.api.datasets.VerificationMode; public class ArtifactDescriptionImpl implements ArtifactDescription { + private DatasetDescription dataset; private String name; private String text; private String url; private String sha1; + private String sha512; + private VerificationMode verificationMode = BINARY; private boolean shared; private List actions; + @Override + public DatasetDescription getDataset() + { + return dataset; + } + + public void setDataset(DatasetDescription aDataset) + { + dataset = aDataset; + } + @Override public String getName() { @@ -76,6 +94,28 @@ public void setSha1(String aSha1) sha1 = aSha1; } + @Override + public String getSha512() + { + return sha512; + } + + public void setSha512(String aSha512) + { + sha512 = aSha512; + } + + @Override + public VerificationMode getVerificationMode() + { + return verificationMode; + } + + public void setVerificationMode(VerificationMode aVerificationMode) + { + verificationMode = aVerificationMode; + } + @Override public List getActions() { diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Explode.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Explode.java index 83ef0e9b66..7bede6f9f1 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Explode.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Explode.java @@ -31,6 +31,7 @@ import java.util.Map; import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.ArchiveException; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry; @@ -75,7 +76,7 @@ public void apply(ActionDescription aAction, DatasetDescription aDataset, if (targetFile.toString().toLowerCase(Locale.ENGLISH).endsWith(".rar")) { extractRar(aAction, targetFile, dsi.getOwner().resolve(dsi)); } - if (targetFile.toString().toLowerCase(Locale.ENGLISH).endsWith(".7z")) { + else if (targetFile.toString().toLowerCase(Locale.ENGLISH).endsWith(".7z")) { // 7z does not support streaming in Apache Commons Compress extract7z(aAction, targetFile, dsi.getOwner().resolve(dsi)); } @@ -99,6 +100,9 @@ public void apply(ActionDescription aAction, DatasetDescription aDataset, .createArchiveInputStream(uncompressed); extract(aAction, targetFile, archive, dsi.getOwner().resolve(dsi)); } + catch (ArchiveException e) { + throw new ArchiveException("Unable to extract files from [" + targetFile + "]", e); + } } } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml index 77df8f7a8f..c4fcfc94fb 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml @@ -21,10 +21,12 @@ description: | artifacts: cdb.conll.utf8: url: http://www.let.rug.nl/~bplank/alpino2conll/data/cdb.conll.utf8 - sha1: 11313d405abb0f268247a2d5420afa413eb244e7 + sha1: f5e1517383f4489c8cb0c75ad202ac57c21874fc + sha512: d3702175a3a233cd3b158ae5854ec28ae0bc058108b7c0dac071eff01383501d79c098ae3eeb04c516e105d07bb3dc69bbb7aef6eb19f97607ad73931ee80a48 conll2006-test.conll: url: http://www.let.rug.nl/~bplank/alpino2conll/data/conll2006-test.conll - sha1: 11313d405abb0f268247a2d5420afa413eb244e7 + sha1: c055154ae56dfa8c29d304ed852af90aedf00a5d + sha512: 34792f773363d4b25b396748fd78b14c0e88fc46793800148fa59855835077b1a2a67f56d4989fb58407a4dc79afdc8e09ab803b9dc32ceac22bd9f9d9a4725b roles: data: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml index 41e37f3168..e966f196e9 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml @@ -23,10 +23,13 @@ licenses: artifacts: LICENSE.txt: url: http://www.cs.cmu.edu/~ark/ArabicNER/corpus/LICENSE - sha1: 43f4082fb8432ad86d927bdff687f9406db43d0f + sha1: 54977f4065ec070057e99b4b446273e5c8f071d2 + sha512: 10ebe8ff7e3e41c65ff1ce412c6af0dc5bde5eedd1847440e82d50629f102ed7f8d1af24e551ea5c7b2bb846f186edeeda5d0bc853774e41cdb70b78a5158180 + verificationMode: TEXT data.zip: url: "http://www.cs.cmu.edu/~ark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip" sha1: 4fa2c37d7673bb456c6e382566a091545531d85f + sha512: 3936cbc9a0e8f07090cab1cac27b348352bafccf427a47d5257b6975e0231b27c7e62c8d86d22a0c533310bdcbebd7cbc1ae91c727265727bc1ca0dd540a6b4c actions: - action: explode configuration: { includes: "*.txt" } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/brown-en-teixml.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/brown-en-teixml.yaml index 77f68e3ab7..9d00638e78 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/brown-en-teixml.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/brown-en-teixml.yaml @@ -38,6 +38,7 @@ artifacts: brown.zip: url: "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown_tei.zip" sha1: 1e4eadeb358f6f7e6ac9b3677a82f4353bbe91ed + sha512: f3dcc36bcab63d481e4d833c8946f10163f732166114c8fdd63932fff9fba3c236593a082ebcdf96f74aea6d33e424b7be4c645fd0f5ee5090f0335544c02c47 actions: - action: explode configuration: { excludes: "Corpus.xml", strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml index 18cbf66717..b8a93dcd6c 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml @@ -29,10 +29,13 @@ licenses: artifacts: LICENSE.txt: url: https://www.gnu.org/licenses/gpl-2.0.txt - sha1: 4cc77b90af91e615a64ae04893fdffa7939db84c + sha1: 0e5aad9553dc0ed784ec220bb09e22d52fefbb8b + sha512: 7881dbc2d75fd63161fa31c2209b21c7858e9664c3cab00fcae14bddac91bded9eba2f34252b5d734fa3f98c35e6ed3a388044eba3bbb746233aafac182cc442 + verificationMode: TEXT data.zip: url: https://github.com/mbkromann/copenhagen-dependency-treebank/archive/2fa64f811364db42407fb4bcdd2189d4ee33bda1.zip sha1: 11313d405abb0f268247a2d5420afa413eb244e7 + sha512: 9184e3bb3e07caffd932f38060a37d80aa294f6b3c05cd68754ed46d8a82cb892b94a348e1ccf17104740daf900a2f7a7adda4bce35615e8138970ee949e3da5 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2000-en.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2000-en.yaml index b9a166d589..5088641c96 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2000-en.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2000-en.yaml @@ -34,6 +34,8 @@ artifacts: train.txt.gz: url: "https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz" sha1: 9f31cf936554cebf558d07cce923dca0b7f31864 + sha512: 02a9be73f1bdd3d654ec2337cb64b358f0f6df0428b5da167aa95462d4fe06f4834ddd2dbad8cd2dd6eeb06d759379ae94f7dd2790e06f7d334afad902ec233c test.txt.gz: url: "https://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz" sha1: dc57527f1f60eeafad03da51235185141152f849 + sha512: 2668c8e4025cfc8067d78c5a6ef08ffa8d66883a351faeb995314a7832801b96c5997cac2117c6a34b0ed58c38073f00091b629e11a2341945c2e835a7410b5c diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-es.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-es.yaml index 403fe3c94b..3e121f012d 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-es.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-es.yaml @@ -23,8 +23,9 @@ description: | artifacts: data.tgz: - url: "http://www.cnts.ua.ac.be/conll2002/ner.tgz" + url: "https://web.archive.org/web/20170307123302if_/http://www.cnts.ua.ac.be/conll2002/ner.tgz" sha1: 686ef8fed3125a1d8aefe1351ff0e619fe9c34cb + sha512: 61a7423b1fb2bd3dac0f85b37e56a04b26d0aa8443d707191c93a9ea83da9990edab4eb71e689bd223bb38504208a17b750cbec94e436362c9f7c524da8b8e64 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-nl.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-nl.yaml index 94c0c1d6e8..831948c3f5 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-nl.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-nl.yaml @@ -22,6 +22,7 @@ artifacts: data.tgz: url: "http://www.cnts.ua.ac.be/conll2002/ner.tgz" sha1: 686ef8fed3125a1d8aefe1351ff0e619fe9c34cb + sha512: 61a7423b1fb2bd3dac0f85b37e56a04b26d0aa8443d707191c93a9ea83da9990edab4eb71e689bd223bb38504208a17b750cbec94e436362c9f7c524da8b8e64 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2006-pt.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2006-pt.yaml index 1fe6efb046..2a3502935b 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2006-pt.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2006-pt.yaml @@ -24,17 +24,22 @@ licenses: artifacts: README.txt: - url: http://www.linguateca.pt/floresta/CoNLL-X/readme.conll - sha1: 7afe672cba645d22fc037d8f6e2bf9d501d0aee6 + url: https://www.linguateca.pt/floresta/CoNLL-X/readme.conll + sha1: 10da89fed0ecb888c8fc7fad350b1a11bb9050d7 + sha512: 178e28f6d7e13728412736659e4afc2c46302f16c2bff860e39516707aeb5acc072bc3f0ab1852e35853b240051985b682e2e64f3483175bf941cf512cfc1b53 + verificationMode: TEXT portuguese_bosque_train.conll: - url: http://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_train.conll + url: https://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_train.conll sha1: 29e630e207c74a42e0d2999193aa25d73f262920 + sha512: 32efcaece5c81e6b2fb31efeba09613ae50374c0b89c83969f2c43ca3b2a527e6944a0727e2983e374824264ef0f0c398c18a5a1f75c42764d415fca0755e524 portuguese_bosque_test_blind.conll: - url: http://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_test_blind.conll + url: https://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_test_blind.conll sha1: fabcfbd73a531e21786af9b8233f1a4aa78dfddb + sha512: fbd3382dfb5acd2a34d5ecd5cbe449c33e96b8618b69d1de772f70a799cfb5b684c209f193474e63683033528af4e5111c8c3c0af4168c3d121b448630bce424 portuguese_bosque_test.conll: - url: http://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_test.conll + url: https://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_test.conll sha1: e399cdc1203df1ff43816f3f934223cb9a625992 + sha512: 8d600d1158d87f446c2814f5adb74b8a7380cdffbf4a62e971d7d7775702b9574506b92fcfe347f0572976447a20b6499b8693ac412a89335fa71c00a1269730 roles: training: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ca.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ca.yaml index e8eb8c49b3..53ce9b3d83 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ca.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ca.yaml @@ -30,6 +30,7 @@ artifacts: data.zip: url: "http://ufal.mff.cuni.cz/conll2009-st/data/CoNLL2009-ST-Catalan-traindevC.zip" sha1: 500cbb81709012cce4d23bfa72d93c320b0b7e6f + sha512: de862ccb6ffca557453dc6d631d6b7b0125724aa56c357e67ebc38d792f866dc563dfd2dceca8c67050d4018759e499d966f19bca90048c303a4324c65a45d4d actions: - action: explode configuration: { strip: 1, includes: ["README.TXT", "datasets/*" ]} diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-de.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-de.yaml index 5d48189021..ea9e0e59aa 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-de.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-de.yaml @@ -35,7 +35,7 @@ artifacts: # URL of the artifact url: "http://ufal.mff.cuni.cz/conll2009-st/data/CoNLL2009-ST-German-traindevB.zip" # Checksum used to validate download and cache integrity - sha1: ad4c03c3c4e4668c8beb34c399e71f539e6d633d + sha512: ae037e60d1065c72fdf7aca6507d1249538c9ae7f2f3662305787da8bd60afa810c43226d7b36712c627d371ab8dd3e01dd6565c65971ac9479584dcaaedb6dd actions: - action: explode # Extract archive after downloading configuration: { strip: 1 } # Remove one leading path element while extracting diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-es.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-es.yaml index 8bfc6cb0aa..2a46fd28cf 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-es.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-es.yaml @@ -30,6 +30,7 @@ artifacts: data.zip: url: "http://ufal.mff.cuni.cz/conll2009-st/data/CoNLL2009-ST-Spanish-traindevB.zip" sha1: ef36c3369bd05966609b4b13d6bf78884c23ece1 + sha512: 2b20574c36c684bd2e406f6356298f9f853366e8627866d60e87e7d95c8d87f2b159df2dffc8ac6a632bd833ce36b0898dd7cde8e375a314ce5e7a546bcdb594 actions: - action: explode configuration: { strip: 1, excludes: [ "documentation", "documentation/**/*" ] } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ja.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ja.yaml index a8b58d8ead..971033a05f 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ja.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ja.yaml @@ -24,6 +24,7 @@ artifacts: data.zip: url: "http://ufal.mff.cuni.cz/conll2009-st/data/CoNLL2009-ST-Japanese-traindevA.zip" sha1: 8c96a1eda2527a9ba1bf37dd4125cc6af11e7dd4 + sha512: 135eb63b727e0a8b77da72af32bf5f5ec84a2bda3d7e44866a2e5091d7d23b7e723fd5540d839b3e0a3e60bd16696d1ecb2a66ddcdc020447ec8220420c4971b actions: - action: explode configuration: { strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml index b8d3462585..34f564a5fd 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml @@ -19,10 +19,13 @@ licenses: artifacts: LICENSE.txt: url: "https://github.com/CopticScriptorium/corpora/raw/3506b54ce769639c382145161da4f99fd3d6156b/coptic-treebank/LICENSE.txt" - sha1: fc0bdc662ce901ac2c631f9574c9aa8b54ebf8c7 + sha1: 3015e20629818d25c34527d59808e716fd0d8ced + sha512: 6660c7fa3b570110e5ae641b169ecea50582e7ebc4214d111cdd46b783937e3d1165b92a38ef6faca2172813781b27f372117665af9097fe9c658a66ccbe87c8 + verificationMode: TEXT coptic.treebank.conll10: url: "https://github.com/CopticScriptorium/corpora/raw/3506b54ce769639c382145161da4f99fd3d6156b/coptic-treebank/coptic.treebank.conll10" sha1: 8c363df27408cb14cb42f3869916c1575fe1625a + sha512: da1c89705b7ceb1922fd3c91720f57e4b9326401e60ae6bbae3a17fd6ebf884ede61cf92fdd4d150cb6c9a36fe414fad39a49e196b9880c354ee900a817988c7 roles: licenses: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml index 6c933ec3a9..5b8c325bf6 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml @@ -26,9 +26,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT ftb3.1.conllx.gz: url: http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/sources/ftb3.1.conllx.gz sha1: 7c58064bf9995980cea08e84035c0414adc54f06 + sha512: 62a4661d032e155b6f203493498cf761b952bb902de4907c61b5d0d704c74b9a31ee3db553402ccb45f225209e15d475dcecaf8613579fd940d2f89762548c89 roles: licenses: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/germeval2014-de.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/germeval2014-de.yaml index 462e815a9d..958398309d 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/germeval2014-de.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/germeval2014-de.yaml @@ -30,16 +30,21 @@ licenses: artifacts: LICENSE.txt: url: "https://creativecommons.org/licenses/by/4.0/legalcode.txt" - sha1: 563e2664fed2ce3e65bd1dd396422f46c5db9040 + sha1: 9c5bee7a22ab39ad6c19ab29ea9e94ac5874f9c6 + sha512: 222cf997169925ee3a09a26798d04332673693c578c24cb2d0cc550785a8b87849b161dccd9c48d2e4f3fa15290b6a10ac5262945f9c8cc6bdbd362d37416300 + verificationMode: TEXT NER-de-dev.tsv: url: "https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1" sha1: 70aba5d247f51ec22e0bcc671c7fb325e4ff4277 + sha512: 322f6e931988374d51eb8cb43171a8fbb612fd15d3255167f33c23389736dd21f8ac06354cd7509db94c1e16a8cedf40c68fe4a3547018d50bc343b023fdb90b NER-de-test.tsv: url: "https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1" sha1: 214deaf091e01567af2e958aac87863bf685342a + sha512: b835f9d8267873ea7128efeef263eaed7ecd786544dcdd00a599b69f3ed20b98460bb3ab62e939eda56bbe3d629c7605bb9fda97cc83300c7175e6748e6d1d8c NER-de-train.tsv: url: "https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1" sha1: 7644cb09676050c0a2836e06fa0aeb8509b9e1cb + sha512: ed0ed7b883667796386174b1b795015b6c8f690bd84d861d1b825b80c869126ddd158eb9a5f8dc7ae0df037ea9aef615db0e829766e9304a21a3f513d57f984f roles: training: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml index 0838503dbd..d7bf83e225 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml @@ -22,16 +22,17 @@ licenses: url: http://www.opendatacommons.org/licenses/pddl/1.0/ artifacts: - glove.6B.zip: - url: "https://nlp.stanford.edu/data/glove.6B.zip" + data.zip: + url: "http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip" sha1: b64e54f1877d2f735bdd000c1d7d771e25c7dfdc + sha512: 8a600c0df42436554d034d23d6d82f51b7c2e4ab8a3e3554b403bac951c9c600a2ef5612d89b2ed59ce8aecaed3c4c1d53a4e9e2a696999b95e64af267a8752e actions: - action: explode roles: data: - - glove.6B.50d.txt - - glove.6B.100d.txt - - glove.6B.200d.txt - - glove.6B.300d.txt + - data/glove.6B.50d.txt + - data/glove.6B.100d.txt + - data/glove.6B.200d.txt + - data/glove.6B.300d.txt \ No newline at end of file diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-dep-stanford-en-4.1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-dep-stanford-en-4.1.0.yaml index a051a6d036..d05593f366 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-dep-stanford-en-4.1.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-dep-stanford-en-4.1.0.yaml @@ -48,6 +48,7 @@ artifacts: gum.tar.gz: url: "https://github.com/amir-zeldes/gum/archive/V4.1.0.tar.gz" sha1: 91ded1ba5b6c05fe8e70e42a0a36ee0d20556888 + sha512: 4ca7a346f2f8d344db0ac798152fbafbf6fbb794047574f5dd0475050179a69ae9972312babae3c6fada9b4fcd313b1167f83e6a70bc6f292ce721bb12d2f3c6 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml index a5e3d484e3..4f7c9f8dd1 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml @@ -42,6 +42,7 @@ artifacts: gum.zip: url: "https://github.com/amir-zeldes/gum/archive/V2.2.0.zip" sha1: b17e276998ced83153be605d8157afacf1f10fdc + sha512: c9606ba69ec1152267b8c801510f251cdcff1b835a53fd5bf9416800499bb6201a731039a6fdaf1baebf4f3048b325034d267485bd3f7dc3633443f9a16e00c3 actions: - action: explode configuration: { includes: ["dep/*", "LICENSE.txt", "README.md"], strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml index c979d0050d..eb3bbf4246 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml @@ -42,6 +42,7 @@ artifacts: gum.zip: url: "https://github.com/amir-zeldes/gum/archive/V2.3.2.zip" sha1: 471c3a35c2a0e9aee4bbff9a9cf05441fce3ef21 + sha512: 713d731714ff037ab79ccc9db34a6de7b02c3d55adc67a9aeaad2d18c5f96cb12173fb6fe7fefd3aca6ffaba606932e392623f5acd1c56045536939f7ac74ea4 actions: - action: explode configuration: { includes: ["dep/*", "LICENSE.txt", "README.md"], strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml index 5079bbd057..5d0ace9dd2 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml @@ -45,6 +45,7 @@ artifacts: gum.zip: url: "https://github.com/amir-zeldes/gum/archive/V3.0.0.zip" sha1: b590dbe3f4ae198ca500618a53491f75c221e98b + sha512: 540240d6e9827cb316b5dedc3667f9245f1effd9525da1e8b14a0700ceed7da683bb358bda5ee4c0e3457fe20260574d0485e4574ab357fed7bf598e4efe46de actions: - action: explode configuration: { includes: ["dep/*", "LICENSE.txt", "README.md"], strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml index b60d29e65c..29e65fd2b0 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml @@ -22,7 +22,9 @@ licenses: artifacts: LICENSE-CC-BY-SA.txt: url: https://creativecommons.org/licenses/by-sa/4.0/legalcode.txt - sha1: 8f551a766d1f4556d1a2596365c0fc2191366efa + sha1: 7f893542ae74df4c277b98278ad9e3ad6c09e690 + sha512: 492cfa38f596c70aed7006ed695da45d15ae674d3e750e0791912f0f19c8814fab947535e19a8f9bf7ec20167a62554d5a1845b6612fc22970697eb39f0ca5f6 + verificationMode: TEXT LICENSE-HZSK-ACA.txt: text: | HZSK-ACA language resources can be accessed only for research purposes by ACAdemic @@ -32,6 +34,7 @@ artifacts: hamburgDepTreebank.tar.xz: url: "https://corpora.uni-hamburg.de:8443/fedora/objects/file:hdt_hdt-conll/datastreams/hdt-conll-tar-xz/content?asOfDateTime=2016-02-17T15:38:47.643Z&download=true" sha1: 6594e5cd48966db7dac04f2b5ff948eb2bcadf37 + sha512: 50c38068e63487845dfc98e3414bddfae3e6e463b8cdb97a91f30d64c37637893342ac5bc8af584749397039c00287c19eaa14262b7abe62b2ca7bd53b14bcd0 actions: - action: explode configuration: { strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/iulatb-es-1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/iulatb-es-1.yaml index d3c1eb5535..8f02ff062d 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/iulatb-es-1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/iulatb-es-1.yaml @@ -32,9 +32,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT data.rar: - url: http://repositori.upf.edu/bitstream/handle/10230/20048/IULA_Spanish_LSP_Treebank.rar?sequence=1 + url: https://repositori.upf.edu/bitstream/handle/10230/20048/IULA_Spanish_LSP_Treebank.rar?sequence=1 sha1: 67e2ce3327501605b7c9f0844cc4982070612222 + sha512: a2d6786fb41701699b9dad1fe6ac2de93d212aac28492a2cc99e3116764e2236684926c4e9b1bacde937b6083c58381f08434505b21fdb739ff774b8e84d9f23 actions: - action: explode - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml index 34a9df038f..2b294d611a 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml @@ -27,9 +27,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by-nc/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT data.zip: url: http://nl.ijs.si/jos/download/jos100kv2_0.zip sha1: 9f330ffd102cc5d5734fdaecbbf67751c84a1339 + sha512: 3358d37ef31ee7ac6b5dbd846de6a2c56396cb4856efa00d7731011a603894720a3d922a108d14c62dd504b5b4909d5ce0e0d7699f350c7007a3e08409ee4ce2 actions: - action: explode configuration: { strip: 1, includes: [ "00README.txt", "jos100kv2_0-sl.conll" ] } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml index 29d09db853..c19faa46bf 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml @@ -21,6 +21,7 @@ artifacts: data.zip: url: "http://www.anc.org/MASC/download/masc-conll.zip" sha1: d9f53a05c659204a3223e901c450fe8ffa5fa9fa + sha512: 67d9e67f8003153e9782a151d9c5ea8646d0c8604de13ae54c90416ed682171f6dfc745dc2a5ff1677e7f0517c94c6067cb6372a0b86fddb6e410b89d9af28cc actions: - action: explode configuration: { strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml index 31ff9ea78b..ea155a9224 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml @@ -23,10 +23,13 @@ licenses: artifacts: LICENSE_NDT.txt: url: https://www.nb.no/sbfil/dok/LICENSE_NDT.txt - sha1: a2f433206f421c0d630b3bec5fad01334673b765 + sha1: ae02a3ca7e000d6cc98f07d3a8aa017f38900499 + sha512: bcd16abbb9b8604640488871432092825cd535bcf2561ada5f3807014e0d5433cdff8fd6f913d39d723497490bdb8da4329ad3da59beb2ba0634898965535942 + verificationMode: TEXT 20140328_NDT_1-01.tar.gz: url: https://www.nb.no/sbfil/tekst/20140328_NDT_1-01.tar.gz sha1: 97935c225f98119aa94d53f37aa64762cba332f3 + sha512: ace37828398cb00677adf38ba2f4046a4bf21934c4abc326ba027251d599c595871a6488b6692b3ac968a0967bc9d727ef9aab71ef34abee87e805abb43bc2ab shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml index fb1cf2bea1..ad66a04826 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml @@ -23,10 +23,13 @@ licenses: artifacts: LICENSE_NDT.txt: url: http://www.nb.no/sbfil/dok/LICENSE_NDT.txt - sha1: a2f433206f421c0d630b3bec5fad01334673b765 + sha1: e907dbafba91ad8ed2a3412dc4e84df3c0a6ee58 + sha512: 5e0565dfadf1865fcf7e14a13a8d555812e5178c8e6b5639c2e2e508eac5066782a545e18adadaac0515fa6a3350c1492354373cd0b1e64a5498ab49d6237726 + verificationMode: TEXT 20140328_NDT_1-01.tar.gz: url: http://www.nb.no/sbfil/tekst/20140328_NDT_1-01.tar.gz sha1: 97935c225f98119aa94d53f37aa64762cba332f3 + sha512: ace37828398cb00677adf38ba2f4046a4bf21934c4abc326ba027251d599c595871a6488b6692b3ac968a0967bc9d727ef9aab71ef34abee87e805abb43bc2ab shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml index 5f246c8acd..590c4933a9 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml @@ -21,10 +21,13 @@ licenses: artifacts: LICENSE.txt: url: https://creativecommons.org/licenses/by-sa/3.0/legalcode.txt - sha1: fb41626a3005c2b6e14b8b3f5d9d0b19b5faaa51 + sha1: fb6f31be27fed5efbcd4c2e1e64c50de470364b1 + sha512: ba59a7187a93fd7e0d4bcbf4f18076a341f8d4091d0ebc5d2b6f3ee7e8e3c79cd6c485640880def013e9116cba55c7ddc08890ff9859d0403f075393df45ea9f + verificationMode: TEXT data.zip: - url: "http://www.thomas-zastrow.de/nlp/nemgp_trainingdata_01.txt.zip" + url: "https://www.thomas-zastrow.de/nlp/nemgp_trainingdata_01.txt.zip" sha1: f2a1fd54df9232741a3a1892d1ffb0a4d7205991 + sha512: 128a2abc5c07b7483e626e65d05db9c4c80bb782e2bd7770b59e6748d6847ab3734ee97e00d1fe72e4346bc6aef0e489bd6efd3ca4e3b7e4824aef4e49704587 actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml index 7e1c7c0f4c..60b374035f 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml @@ -23,9 +23,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by-sa/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT perseus.zip: url: "https://github.com/PerseusDL/treebank_data/archive/f56a35f65ef15ac454f6fbd2cfc6ea97bf2ca9b8.zip" sha1: 140eee6d2e3e83745f95d3d5274d9e965d898980 + sha512: b8fe14202b5dbe6d7c7b387f38a80036d62d3ecc860fa0fc1ee698ed10a8121b144c2c36b09b45fd6b4fb17a025f88e4669be66524b8a5b550c57032f789ceb4 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml index fe3a1020fd..461cce080e 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml @@ -23,9 +23,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by-sa/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT perseus.zip: url: "https://github.com/PerseusDL/treebank_data/archive/f56a35f65ef15ac454f6fbd2cfc6ea97bf2ca9b8.zip" sha1: 140eee6d2e3e83745f95d3d5274d9e965d898980 + sha512: b8fe14202b5dbe6d7c7b387f38a80036d62d3ecc860fa0fc1ee698ed10a8121b144c2c36b09b45fd6b4fb17a025f88e4669be66524b8a5b550c57032f789ceb4 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml index c54243e850..bb9e9b0714 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml @@ -20,10 +20,13 @@ licenses: artifacts: LICENSE.txt: url: https://www.gnu.org/licenses/gpl-3.0.txt - sha1: 8624bcdae55baeef00cd11d5dfcfa60f68710a02 + sha1: 8b0cb355ed76e07cc7c876fec58341c2940cfee7 + sha512: b311f68b1c2dbf8f079f381d9854c185e1f6b64cb375ca96a5f67e25cb375d9f106875523e6ef7adbd8f1156ec572eb9f4ae8f04e6da6e6de35dd7938db354df + verificationMode: TEXT poldb-0.5.conll.gz: url: "http://zil.ipipan.waw.pl/Składnica?action=AttachFile&do=get&target=Składnica-zależnościowa-0.5.conll.gz" sha1: 187424608e91b271957dabcf140a7274f1c88d63 + sha512: d08dc44330d5084fa06409a6b76b99b90a201d8564c7dd2bd6435ee196898cc1787dfc93820842d38921086375c452ec057e5a95dfbd7a4ce48eacee8948df37 roles: licenses: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml index 260a1c3e21..1dd7223a5c 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml @@ -20,10 +20,13 @@ licenses: artifacts: LICENSE.txt: url: https://www.gnu.org/licenses/gpl-3.0.txt - sha1: 8624bcdae55baeef00cd11d5dfcfa60f68710a02 + sha1: 8b0cb355ed76e07cc7c876fec58341c2940cfee7 + sha512: b311f68b1c2dbf8f079f381d9854c185e1f6b64cb375ca96a5f67e25cb375d9f106875523e6ef7adbd8f1156ec572eb9f4ae8f04e6da6e6de35dd7938db354df + verificationMode: TEXT poltb-0.5-tiger.xml.gz: url: "http://zil.ipipan.waw.pl/Składnica?action=AttachFile&do=get&target=Składnica-frazowa-0.5-TigerXML.xml.gz" sha1: c8977d436d218b726d657224305bced178071dcf + sha512: 3da399b090dde90297a66cda7c5a6334bfee8bf16c9b6fb6d2af135f049ddbc57ca19cae4e382d26deba308c82f4bb970dc4a59e202a798665a6cbb49a23ee5d roles: licenses: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml index 3a203cfa96..5b8c02f636 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml @@ -30,6 +30,7 @@ artifacts: data.zip: url: http://nl.ijs.si/sdt/data/SDT-2006-01-06-CoNLL-X/data.zip sha1: 2bd85ad77c35d0c305a6afb7ee092676d5d22a35 + sha512: 022d4ffc2dbbe54b660fc6bedb2fe92b8a1b610749e6973dbc3798792ce82f875313acf6420d499111a2b17b9b7180cda48ffa81a665ea876e72a12aa473a73b actions: - action: explode configuration: { strip: 3 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml index 295b63328d..501e613aa2 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml @@ -30,10 +30,13 @@ licenses: artifacts: README.txt: url: http://nl.ijs.si/sdt/data/SDT-2006-05-17/00README.txt - sha1: d2ac8d9f8b45ceae34ce77f57b354662292bd609 + sha1: 9d047377eb96aa896461544cd1117b11b812809f + sha512: 2c11bf1b3c5960394dff57330b31020a51dae18bb0d91a7eac65ecfb4f6bb5eccb241b34036089bc58954b2f3752c9f0eab8672e8109b059c77a1c376679956b + verificationMode: TEXT sdt-conll.tbl: url: http://nl.ijs.si/sdt/data/SDT-2006-05-17/CONLL/sdt-conll.tbl sha1: 16cfa8a20ebf8ed0e4f13c0119c7aa76a2498b1f + sha512: 83359227235370ab16fe830d437bf4ca710d2b9a1e2885a8d0d44ec935aab2c187956a4ad0c5323eb7989c06c5f7326c843b1694a6de148de5c25cc8b0bd3958 roles: licenses: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml index 3b15e34fff..77944a535d 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml @@ -30,6 +30,7 @@ artifacts: sequoia.tgz: url: "http://talc2.loria.fr/deep-sequoia/sequoia-7.0.tgz" sha1: 9f53475f809ef1032a92adedf262226da1615051 + sha512: d6a90a7404caaf4c25ca48098b76fa2abcdbe88c45d1954548d76362b16d988cbbc4025e7cd7810fc7fec2141be8dda11ebc29eca15708d0e1e3e149ccc4d951 actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethr-hr-1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethr-hr-1.yaml index f3ad25971d..eb1972ed2f 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethr-hr-1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethr-hr-1.yaml @@ -20,9 +20,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by-sa/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT setimes.hr.v1.conllx.gz: url: http://nlp.ffzg.hr/data/corpora/setimes.hr.v1.conllx.gz sha1: 0faebfe55136692f83dcddd4cf659a8b59655d62 + sha512: 81f4389172e6d340d7a8cf6581c86bc6213927ac4f25d0dd104e32c6ffb414f08ce2c3e6cdbde0bf6233acd7c3e8b9d475862d89dad762ba84d82d80b80d574f roles: licenses: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml index ad024eed1f..6142ce0c21 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml @@ -28,13 +28,18 @@ licenses: artifacts: LICENSE-CC-BY.txt: url: "https://creativecommons.org/licenses/by/4.0/legalcode.txt" - sha1: 563e2664fed2ce3e65bd1dd396422f46c5db9040 + sha1: 9c5bee7a22ab39ad6c19ab29ea9e94ac5874f9c6 + sha512: 222cf997169925ee3a09a26798d04332673693c578c24cb2d0cc550785a8b87849b161dccd9c48d2e4f3fa15290b6a10ac5262945f9c8cc6bdbd362d37416300 + verificationMode: TEXT LICENSE-CC-BY-NC-SA.txt: url: "https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt" - sha1: 5d572362228001e9dbc0c8802f49121ceb78ace2 + sha1: 54cc324681563e5ede8088f020f0b21e35d37fb9 + sha512: 84b09f6057afa41c8e495697b67da30d6be0d00f04c4d7c244012f8003088d29f43f474905be1c9262d14f6e199130bbad64371818e32f60aa0311faa271e1ca + verificationMode: TEXT data.zip: url: https://github.com/ffnlp/sethr/archive/c50697a81ee588b70328952dd56175da4c298c7c.zip sha1: a52d13cfa91589c0d93fe0a90333a4f0e997b7cf + sha512: 394e06eee8a804fa7bfed2d0ccca152cbe1bf13478459c19212c3fd0bf33ed68ee292bf2528154581110c4fe49a2824661298e4caa19fe8e6b3ba6128427e40f actions: - action: explode configuration: { strip: 1, includes: [ "LICENSE.md", "README.md", "*.hr*.conll" ] } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml index 777c770757..d175541150 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml @@ -19,6 +19,7 @@ artifacts: egw4-reut.512.clusters: url: https://nlp.stanford.edu/software/egw4-reut.512.clusters sha1: 3f1352641a46e985c07d0023c0ada7e5be97e527 + sha512: 9feeb7de9dc49a278a7ec6c8fd02e582d0c154077ee656ae00eaa293a5366d6e74c2724223c38b9030eb30305ce3dd07ac4767f890814c4fa41f71c8c3b8c7f2 roles: data: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml index b2dac6bf86..82e4849e7e 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml @@ -29,8 +29,9 @@ licenses: artifacts: data.tar.gz: - url: http://stp.lingfil.uu.se/%7Enivre/research/Talbanken05_1.1.tar.gz + url: https://cl.lingfil.uu.se/~nivre/research/Talbanken05_1.1.tar.gz sha1: bc836ab364ba37522e2989481104bad2eb96a92e + sha512: 4a93609c6c674edd01f1254d4879f6c908865b48f640c1b813ac537a8a4e160f11fc75857f18d72d88de9b70d3cf8dd831acc97489b0f7c0c1d8c9058209b25c shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml index 67fe42740f..998c5aa736 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml @@ -29,8 +29,9 @@ licenses: artifacts: data.tar.gz: - url: http://stp.lingfil.uu.se/%7Enivre/research/Talbanken05_1.1.tar.gz + url: https://cl.lingfil.uu.se/~nivre/research/Talbanken05_1.1.tar.gz sha1: bc836ab364ba37522e2989481104bad2eb96a92e + sha512: 4a93609c6c674edd01f1254d4879f6c908865b48f640c1b813ac537a8a4e160f11fc75857f18d72d88de9b70d3cf8dd831acc97489b0f7c0c1d8c9058209b25c shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml index bc0c8caba6..617cd0d322 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml @@ -29,8 +29,9 @@ licenses: artifacts: data.tar.gz: - url: http://stp.lingfil.uu.se/%7Enivre/research/Talbanken05_1.1.tar.gz + url: https://cl.lingfil.uu.se/~nivre/research/Talbanken05_1.1.tar.gz sha1: bc836ab364ba37522e2989481104bad2eb96a92e + sha512: 4a93609c6c674edd01f1254d4879f6c908865b48f640c1b813ac537a8a4e160f11fc75857f18d72d88de9b70d3cf8dd831acc97489b0f7c0c1d8c9058209b25c shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml index a0de5bcac6..57fc0b255d 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml @@ -32,9 +32,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by-nc-sa/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT data.tar.gz: - url: http://ahclab.naist.jp/resource/tedtreebank/naist-ntt-ted-treebank-v1.tar.gz + url: https://ahcweb01.naist.jp/resource/tedtreebank/naist-ntt-ted-treebank-v1.tar.gz sha1: 89c6495bd64c4b3e699b4c478b47a0c827ea46ea + sha512: a433d0dd1de9a04280f0115491e8d5414e6f5303fb271441cdd59f440eb5dd5c0f8cbbfd130f68dceb24323538ce2b3e0d8f0f77c61054338bf5816a7fd08b4b actions: - action: explode configuration: { strip: 1, includes: [ "README.md", "en-dep/*.dep" ] } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml index a8a3233f1a..3c4b658fe1 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml @@ -23,26 +23,31 @@ artifacts: NEWS.zip: url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/NEWS-22nov2010.conl.zip sha1: 3d9b22d8ebf533aa1d6d39d417316c30900b9a0e + sha512: 2922115acd622f290518a5863edad57cd8e57030660c83c9f79d76045b3da318ff7fcdbf40404be78f7548aaf977c2355be34e12ff0c85ea98096a298f2c8fd8 actions: - action: explode VEDCH.zip: url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/VEDCH-22nov2010.conl.zip sha1: 2278e6e770ddc4a8eea5e045c4a77a5df2ae0977 + sha512: 8aac15c988d266719df467fa1e07bf1771773f1e6b93ba8d44d991c9ad8fc0019ecfcaa2507bcfdf47c4f5f2b6e5137f59d025a4234fe06e5232af50abf1c18f actions: - action: explode CODICECIVILE.zip: url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/CODICECIVILE-22nov2010.conl.zip sha1: 9cf9c0a9c652b3df6564d1fa0ca97c2d7905faa3 + sha512: dd7507383c9f940df7d11975738198f4f5dd0174ddd25aa4ed2bba592e35c7758443b934c384063193d20e55fab7d58e155f6b48bfe23407fab4bce89a22a77e actions: - action: explode EUDIR.zip: url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/EUDIR-22nov2010.conl.zip sha1: 72a6e55627481ff99930b59714cfc0909ccf60e1 + sha512: b4502d9a0f4749e0a27b0e1bb4b8e0c51b8d71d55bceaa46124048d136957a1d99579add80c482009fa176ff89ec74f1db58d7b356920863c14adc0b47a47023 actions: - action: explode WIKI.zip: url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/WIKI-22nov2010.conl.zip sha1: a421f488859324e3e12687b9a3067652248eb8df + sha512: 5282e893b39a1f03c0b6a9a3afc31ecedee4f2e014aceb7bcf01b01dfdf14fbd586008747df3c68443c4843195c565a420912d9d65b910facf0063f2c6f26f87 actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml index 1e6e9ea353..5bebd70fb8 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml @@ -29,6 +29,7 @@ artifacts: data.tgz: url: "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1827/ud-treebanks-v1.4.tgz?sequence=4&isAllowed=y" sha1: 1c41c28b000935ffa6c63b9ff17c48e892c56597 + sha512: b41e297dc6befb8e7dfe1fe3281e796e4ee2fceff87187f1a7db2f75eb232705605e4ee2c282345db28b4f7970ac767fa6c572f1b5486c263ea94a814d360b38 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml index f7e4f743bf..bf34996b67 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml @@ -22,19 +22,24 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT train-conll.tar.gz: - url: "http://stp.lingfil.uu.se/~mojgan/train.conll.tar.gz" + url: "https://cl.lingfil.uu.se/~mojgan/train.conll.tar.gz" sha1: 6ace1d1132b121b09d0b88f53749d28a59843cd5 + sha512: 8fa53bdcc22a1f5173cb41d652e04a10daa27410162e68b4189be0bdaa04d5c24a88515a904cb639015efdc6c6bd78dea2d679897f2aa4785001256db7181d89 actions: - action: explode dev-conll.tar.gz: - url: "http://stp.lingfil.uu.se/~mojgan/dev.conll.tar.gz" + url: "https://cl.lingfil.uu.se/~mojgan/dev.conll.tar.gz" sha1: e96a06b399bb1f565e16e49fb4dfe7da241f5d75 + sha512: af5b6def61cad814c91b5da850c3f79f49f2f068ea43ea4f5c492b644f9380badf760f7d459866ceec82a849bce365c4497c48a5f3aefe553df99fd230921829 actions: - action: explode test-conll.tar.gz: - url: "http://stp.lingfil.uu.se/~mojgan/test.conll.tar.gz" + url: "https://cl.lingfil.uu.se/~mojgan/test.conll.tar.gz" sha1: ec79e91413dd2c49883bfbbd1a207f68377ac683 + sha512: 00ef70ba91dff176163210c0c4123157ea47383cc9682e0360ed2ef55bda3820ee7ce1b623c77ef354c6ce4fdbbc0059d466786b816c3c81d88cde845e5d9546 actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml index 31254a6153..ccbfec6043 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml @@ -6,7 +6,7 @@ mediaType: text/x.org.dkpro.conll-2009 encoding: UTF-8 name: English Word Sense and Semantic Role Datasets (WaSR) -url: https://www.ukp.tu-darmstadt.de/data/semantic-role-resources/knowledge-based-semantic-role-labeling/ +url: https://www.informatik.tu-darmstadt.de/ukp/research_6/data/semantic_role_resources/knowledge_based_semantic_role_labeling/index.en.jsp attribution: | Silvana Hartmann, Judith Eckle-Kohler, and Iryna Gurevych. Generating Training Data for Semantic Role Labeling based on Label Transfer from Linked Lexical Resources. In: Transactions of the @@ -25,9 +25,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by-nc-nd/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT data.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-de_v1.tar.bz2" + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-de_v1.tar.bz2" sha1: b706711ae6fffc94409f80b635595bd45d8c2ece + sha512: ff2bc3becad49146873dc54644f299d1362f106258e0ed939e1c14058b02429372aa39d4027bf040845af6db20073c80a0813452cf103ffd0adf3d55eaea1704 actions: - action: explode configuration: { strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml index ec9577c2bb..66078835a8 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml @@ -6,7 +6,7 @@ mediaType: text/x.org.dkpro.conll-2009 encoding: UTF-8 name: English Word Sense and Semantic Role Datasets (WaSR) -url: https://www.ukp.tu-darmstadt.de/data/semantic-role-resources/knowledge-based-semantic-role-labeling/ +url: https://www.informatik.tu-darmstadt.de/ukp/research_6/data/semantic_role_resources/knowledge_based_semantic_role_labeling/index.en.jsp attribution: | Silvana Hartmann, Judith Eckle-Kohler, and Iryna Gurevych. Generating Training Data for Semantic Role Labeling based on Label Transfer from Linked Lexical Resources. In: Transactions of the @@ -25,9 +25,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by-nc-nd/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT part1.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-en_v1_part1.tar.bz2" + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-en_v1_part1.tar.bz2" sha1: ef7ccf5cb23da63003bdb19d99b15b0ea2821e55 + sha512: bc6ba46503596aae4005b32934b23be9bf12399222cb13569f77af3ce262bd84f9a3e86c8b74897a17493969361464a6ff9cd22620f37322241e24741415b480 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml index e1657a8e9f..e421dc337e 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml @@ -6,7 +6,7 @@ mediaType: text/x.org.dkpro.conll-2009 encoding: UTF-8 name: English Word Sense and Semantic Role Datasets (WaSR) -url: https://www.ukp.tu-darmstadt.de/data/semantic-role-resources/knowledge-based-semantic-role-labeling/ +url: https://www.informatik.tu-darmstadt.de/ukp/research_6/data/semantic_role_resources/knowledge_based_semantic_role_labeling/index.en.jsp attribution: | Silvana Hartmann, Judith Eckle-Kohler, and Iryna Gurevych. Generating Training Data for Semantic Role Labeling based on Label Transfer from Linked Lexical Resources. In: Transactions of the @@ -25,9 +25,12 @@ artifacts: LICENSE.txt: url: http://creativecommons.org/licenses/by-nc-nd/3.0/legalcode.txt sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + sha512: cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e + verificationMode: TEXT part1.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-en_v1_part1.tar.bz2" + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-en_v1_part1.tar.bz2" sha1: ef7ccf5cb23da63003bdb19d99b15b0ea2821e55 + sha512: bc6ba46503596aae4005b32934b23be9bf12399222cb13569f77af3ce262bd84f9a3e86c8b74897a17493969361464a6ff9cd22620f37322241e24741415b480 shared: true actions: - action: explode @@ -35,8 +38,9 @@ artifacts: - action: explode configuration: { file: "part1/WaSR_XL_part1_3.7z" } part2.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-en_v1_part2.tar.bz2" + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-en_v1_part2.tar.bz2" sha1: 0a9c98cbf1fe02841edf52e963444a7e38986577 + sha512: 4df84de5414322dad68ef23bca5e75336ff09c22c059a2f82320e8c5aca51fd93bb9b5f12d78f1127f0a518650d03898e504bd05209c1cf7da8b8403f1aa13d0 shared: true actions: - action: explode @@ -44,8 +48,9 @@ artifacts: - action: explode configuration: { file: "part2/WaSR_XL_part2_3.7z" } part3.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-en_v1_part3.tar.bz2" + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-en_v1_part3.tar.bz2" sha1: 9c0cc79ecab9140f82683d39ed6acb51b148f9f7 + sha512: f5c229a13e02fd602f0fadf68c1a6d70ccfa9f29db1ee79a485ab0707a6ee70ed4a5e5b78bbe30e9890565e94a83fecb1b716ed9e5d8635fe0b6428a13c1c33f shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java index 846d514c37..d1a141d930 100644 --- a/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java +++ b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java @@ -24,10 +24,8 @@ import java.io.File; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; -import org.dkpro.core.api.datasets.Dataset; -import org.dkpro.core.api.datasets.DatasetFactory; -import org.dkpro.core.api.datasets.Split; import org.dkpro.core.testing.DkproTestContext; import org.junit.Ignore; import org.junit.Rule; @@ -35,30 +33,27 @@ public class DatasetFactoryTest { - @Ignore("Used at times for offline testing / development") + //@Ignore("Used at times for offline testing / development") @Test public void testOne() throws Exception { - Path cache = testContext.getTestOutputFolder().toPath(); + //Path cache = testContext.getTestOutputFolder().toPath(); + Path cache = Paths.get("target/test-output/testLoadAll"); DatasetFactory df = new DatasetFactory(cache); { - Dataset ds = df.load("wasr-en-xl-1.00"); + Dataset ds = df.load("brown-en-teixml"); assertDatasetOk(ds); } -// { -// Dataset ds = df.load("ndt-nb-1.01"); -// assertDatasetOk(ds); -// } } - @Ignore("Used at times for offline testing / development") + //@Ignore("Used at times for offline testing / development") @Test public void testLoadAll() throws Exception { - Path cache = testContext.getTestOutputFolder().toPath(); + Path cache = Paths.get("target/test-output/testLoadAll"); DatasetFactory df = new DatasetFactory(cache); for (String id : df.listIds()) { diff --git a/dkpro-core-doc/src/main/asciidoc/user-guide/datasets.adoc b/dkpro-core-doc/src/main/asciidoc/user-guide/datasets.adoc index 79627fbbf7..345d5f83df 100644 --- a/dkpro-core-doc/src/main/asciidoc/user-guide/datasets.adoc +++ b/dkpro-core-doc/src/main/asciidoc/user-guide/datasets.adoc @@ -87,7 +87,7 @@ artifacts:: A list of artifacts that make of the dataset. The relevant artifacts be limited to the data files themselves, but could also include license texts or readme files if they are not part of a dataset archive. If a dataset is not distributed as an archive but rather as a set of files, each of the files should be listed here. - To describe an artifact, the **name**, **url**, and **sha1** checksum are required. + To describe an artifact, the **name**, **url**, and **sha512** checksum are required. The name of the artifact should correspond to the filename part of the URL from which the artifact is downloaded. However, sometimes it is convenient to use a simpler name, e.g. `data.zip`. However, the extension should always be preserved. This is particularly @@ -95,6 +95,9 @@ artifacts:: A list of artifacts that make of the dataset. The relevant artifacts [sect_datasets_actions] section below. + If an artifact contains multiple datasets, it can be **shared** to avoid downloading and caching it redundantly. See [sect_datasets_sharing] below. + + It is possible to set the **verificationMode** to **TEXT** in order to normalize whitespace + before calculating the checksum. This is recommended for license files or documentation but + not for actual data files (even if they are in a text format such as a CoNLL variant).+ + .Example artifacts section [source,yaml,indent=0] @@ -102,7 +105,7 @@ artifacts:: A list of artifacts that make of the dataset. The relevant artifacts artifacts: gum.zip: url: "https://github.com/amir-zeldes/gum/archive/V2.2.0.zip" - sha1: b17e276998ced83153be605d8157afacf1f10fdc + sha1: c9606ba69ec1152267b8...(snip)... actions: - action: explode configuration: { includes: ["dep/*", "LICENSE.txt", "README.md"], strip: 1 } From a5c8aab4855b327d24af0c60f65aa087a80c7f9f Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 12 Jun 2019 14:54:51 +0200 Subject: [PATCH 2/4] #1384 - Update datasets API - Add GUM 5.0.0 UD dataset --- .../core/api/datasets/DatasetFactory.java | 14 +++-- .../datasets/lib/gum-ud-en-conll-5.0.0.yaml | 54 +++++++++++++++++++ .../core/api/datasets/DatasetFactoryTest.java | 6 +-- 3 files changed, 68 insertions(+), 6 deletions(-) create mode 100644 dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java index 494c9bddf9..bbc0b8c5b7 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java @@ -226,9 +226,17 @@ public Path resolve(DatasetDescription aDataset) private Path resolve(DatasetDescription aDataset, ArtifactDescription aArtifact) { if (aArtifact.isShared()) { - // Shared artifacts stored in a folder named by their SHA1 - return cacheRoot.resolve("shared").resolve(aArtifact.getSha1()) - .resolve(aArtifact.getName()); + // Shared artifacts stored in a folder named by their hash + // Prefere SHA1 for the time being to avoid users having to re-download too much as + // we slowly switch over to SHA512 + if (aArtifact.getSha1() != null) { + return cacheRoot.resolve("shared").resolve(aArtifact.getSha1()) + .resolve(aArtifact.getName()); + } + else { + return cacheRoot.resolve("shared").resolve(aArtifact.getSha512()) + .resolve(aArtifact.getName()); + } } else { // Unshared artifacts are stored in the dataset folder diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml new file mode 100644 index 0000000000..a38b79b5b8 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml @@ -0,0 +1,54 @@ +groupId: org.dkpro.core.datasets.gum +datasetId: gum-dep-ud +version: 5.0.0 +language: en +mediaType: text/x.org.dkpro.conll-u +encoding: UTF-8 + +name: Georgetown University Multilayer Corpus +url: https://corpling.uis.georgetown.edu/gum/ +attribution: | + Zeldes, Amir (2017) "The GUM Corpus: Creating Multilayer Resources in the Classroom". + Language Resources and Evaluation 51(3), 581–612. + For Gum annotation team, see https://corpling.uis.georgetown.edu/gum/ +description: | + GUM is an open source multilayer corpus of richly annotated web texts from eight text types. + The corpus is collected and expanded by students as part of the curriculum in LING-367 + Computational Corpus Linguistics at Georgetown University. The selection of text types is meant + to represent different communicative purposes, while coming from sources that are readily and + openly available (mostly Creative Commons licenses), so that new texts can be annotated and + published with ease. + + (This description has been sourced from the dataset website). + + The CPOS column of the files contains an extended POS tagset as it is used by the English + TreeTagger models. The POS column contains the regular PTB tagset. + +licenses: + - name: CC-BY 2.5 + url: http://creativecommons.org/licenses/by/2.5/ + comment: "Wikinews texts (Source: https://en.wikinews.org/wiki/Wikinews:Copyright)" + - name: CC-BY-SA 3.0 + url: https://creativecommons.org/licenses/by-sa/3.0/ + comment: "WikiVoyage texts (Source: https://wikimediafoundation.org/wiki/Terms_of_Use)" + - name: CC-BY-NC-SA 3.0 + url: http://creativecommons.org/licenses/by-nc-sa/3.0/ + comment: "WikiHow texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons)" + - name: CC-BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + comment: "Annotations (Source: https://corpling.uis.georgetown.edu/gum/)" + +artifacts: + gum.zip: + url: "https://github.com/amir-zeldes/gum/archive/V5.0.0.zip" + sha512: fbf57b1c5400cad2185337bb8735391ca728583f9d49d40e95fd3e1449ef8160eb36efd400b901f9e33649b4133f9caff0c3f45de41be35adc33257c4e5192a7 + shared: true + actions: + - action: explode + configuration: { includes: ["dep/ud/*", "LICENSE.txt", "README.md"], strip: 1 } + +roles: + licenses: + - gum/LICENSE.txt + data: + - "**/dep/ud/*.conllu" \ No newline at end of file diff --git a/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java index d1a141d930..ac2d6016d1 100644 --- a/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java +++ b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java @@ -33,7 +33,7 @@ public class DatasetFactoryTest { - //@Ignore("Used at times for offline testing / development") + @Ignore("Used at times for offline testing / development") @Test public void testOne() throws Exception @@ -43,12 +43,12 @@ public void testOne() DatasetFactory df = new DatasetFactory(cache); { - Dataset ds = df.load("brown-en-teixml"); + Dataset ds = df.load("gum-ud-en-conll-5.0.0"); assertDatasetOk(ds); } } - //@Ignore("Used at times for offline testing / development") + @Ignore("Used at times for offline testing / development") @Test public void testLoadAll() throws Exception From 10de7f75140052c724be3398cdef6f01e0837ff6 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 12 Jun 2019 15:21:04 +0200 Subject: [PATCH 3/4] #1384 - Update datasets API - Update GUM 5.0.0 UD dataset license info --- .../api/datasets/lib/gum-ud-en-conll-5.0.0.yaml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml index a38b79b5b8..3d6a6b0cc9 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml @@ -27,16 +27,22 @@ description: | licenses: - name: CC-BY 2.5 url: http://creativecommons.org/licenses/by/2.5/ - comment: "Wikinews texts (Source: https://en.wikinews.org/wiki/Wikinews:Copyright)" + comment: "Wikinews/interviews texts (Source: https://en.wikinews.org/wiki/Wikinews:Copyright)" - name: CC-BY-SA 3.0 url: https://creativecommons.org/licenses/by-sa/3.0/ - comment: "WikiVoyage texts (Source: https://wikimediafoundation.org/wiki/Terms_of_Use)" + comment: | + WikiVoyage texts (Source: https://wikimediafoundation.org/wiki/Terms_of_Use); + Wikipedia biographies (Source: https://en.wikipedia.org/wiki/Wikipedia:Copyrights) - name: CC-BY-NC-SA 3.0 url: http://creativecommons.org/licenses/by-nc-sa/3.0/ - comment: "WikiHow texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons)" + comment: | + WikiHow texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons); + Fiction texts (Source: http://smallbeerpress.com/creative-commons/) - name: CC-BY 4.0 url: https://creativecommons.org/licenses/by/4.0/ - comment: "Annotations (Source: https://corpling.uis.georgetown.edu/gum/)" + comment: | + Annotations (Source: https://corpling.uis.georgetown.edu/gum/); + Academic texts (various sources, see LICENSE.txt file) artifacts: gum.zip: From 52876c8762ce4befffdd66d57550b5a6278f2eee Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 12 Jun 2019 15:21:37 +0200 Subject: [PATCH 4/4] #1384 - Update datasets API - Removed comment which shouldn't apply to GUM UD version --- .../org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml index 3d6a6b0cc9..e939a96acd 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml @@ -21,9 +21,6 @@ description: | (This description has been sourced from the dataset website). - The CPOS column of the files contains an extended POS tagset as it is used by the English - TreeTagger models. The POS column contains the regular PTB tagset. - licenses: - name: CC-BY 2.5 url: http://creativecommons.org/licenses/by/2.5/