forked from machine-learning-exchange/katalog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
publaynet.yaml
83 lines (75 loc) · 3.01 KB
/
publaynet.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Copyright 2021 The MLX Contributors
#
# SPDX-License-Identifier: Apache-2.0
# Identifier, display name and summary of the data set.
id: publaynet
name: PubLayNet
# Folded scalar: the line breaks below are joined with single spaces.
description: >-
  PubLayNet is a large dataset of document images from PubMed Central Open
  Access Subset. Each document’s layout is annotated with both bounding boxes
  and polygonal segmentations.
# Quoted so the version is always read as a string.
version: '1.0.0'
# Dates are left unquoted so their scalar type stays as it was.
created: 2019-08-07
updated: 2019-08-07
# File formats listed for this data set, each with a reference link.
format:
  - type: JPG
    url: 'https://en.wikipedia.org/wiki/JPEG'
  - type: JSON
    url: 'https://json.org/'
domain: Computer Vision
# The entity that makes the data set available
provider:
  name: Data Asset eXchange
  url: 'https://developer.ibm.com/exchanges/data/all/publaynet/'
# REQUIRED; where the data set is stored and how it is stored
repository:
  type: HTTP
  url: 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz'
  mime_type: application/x-tar
  sha_512: '087a58aeed533f953c041dc03e5f14bc0b1bf53c44fa1f5816fde7c0376174b1b67c837d93094f96fd6313aba172c914c7bc2d14c6f84b58913c0ddd550264eb'
  size: '96G'
# REQUIRED; license under which the data set is made available
license:
  commercial: false
  name: CDLA-Permissive
  url: 'https://cdla.io/permissive-1-0/'
# REQUIRED; describes relevant files in the data set archive
content:
  # Image directories, one entry per subset.
  - pattern: train/*
    description: Images in the training subset
    type: regex
  - pattern: val/*
    description: Images in the validation subset
    type: regex
  - pattern: test/*
    description: Images in the testing subset
    type: regex
  # Annotation files; each is named after the subset it annotates.
  - pattern: train.json
    description: Annotations for training images
    type: file
  - pattern: val.json
    description: Annotations for validation images
    type: file
# OPTIONAL; Identifies where the data set was obtained from
source:
  name: Images of research papers from PubMed and annotations from IBM Research Australia.
  url: https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist/
  # Each author entry pairs a name with that person's own profile page.
  authors:
    - name: Xu Zhong
      url: https://researcher.watson.ibm.com/researcher/view.php?person=au1-peter.zhong
    - name: Jianbin Tang
      # NOTE(review): confirm this profile id resolves to Jianbin Tang.
      url: https://researcher.watson.ibm.com/researcher/view.php?person=au1-jbtang
    - name: Antonio Jimeno Yepes
      url: https://researcher.watson.ibm.com/researcher/view.php?person=au1-antonio.jimeno
# OPTIONAL, but recommended; tags attached to this data set entry
seo_tags:
  - 'Document Layout Analysis'
  - 'Text'
# OPTIONAL; assets that complement this data set, e.g. notebooks
related_assets:
  - name: Publaynet Watson Studio project
    description: Watson Studio Gallery project for the publaynet data set
    mime_type: text/html
    url: 'https://dataplatform.cloud.ibm.com/analytics/notebooks/v2/43cb95d9-6c3e-479c-a189-8c9ff3524ec1/view?access_token=bb8ce645cf114b5f5512ae2eb9c7badcf0927f313e8f76b8138d0701289484e6'
  # NOTE(review): this title is about table recognition rather than layout
  # analysis; confirm it is the intended paper for this data set.
  # The name is quoted because it contains ": ".
  - name: 'Image-based table recognition: data, model, and evaluation'
    description: Research paper
    mime_type: text/html
    url: 'https://arxiv.org/abs/1911.10683'
# OPTIONAL; URL of the markdown file that describes the asset
readme_url: 'https://raw.githubusercontent.com/machine-learning-exchange/katalog/main/dataset-samples/publaynet/publaynet.md'