From 191cb607dd2f19709c6077aba7941ac14b27d402 Mon Sep 17 00:00:00 2001
From: Angele Zamarron
Date: Tue, 15 Aug 2023 14:03:59 -0700
Subject: [PATCH 1/4] add return...

---
 .../e5910c027af0ee9c1901c57f6579d903aedee7f4.xml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/fixtures/grobid_augment_existing_document_parser/e5910c027af0ee9c1901c57f6579d903aedee7f4.xml b/tests/fixtures/grobid_augment_existing_document_parser/e5910c027af0ee9c1901c57f6579d903aedee7f4.xml
index 3d8cb808..3c453adf 100644
--- a/tests/fixtures/grobid_augment_existing_document_parser/e5910c027af0ee9c1901c57f6579d903aedee7f4.xml
+++ b/tests/fixtures/grobid_augment_existing_document_parser/e5910c027af0ee9c1901c57f6579d903aedee7f4.xml
@@ -88,7 +88,8 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.co
G-pooling and state-of-the-art methods

In order to verify that our proposed G-pooling is able to improve state-of-the-art segmentation approaches, we select DeepLab [6] and SegNet [3] as additional network architectures to test G-pooling. As mentioned above, the models in Section 5 use FCN as the network architecture and VGG-16 as the backbone. For a fair comparison with FCN, VGG-16 is also used as the backbone in DeepLab and SegNet.

DeepLab [6] uses large receptive fields through dilated convolution. For the baseline DeepLab itself, pool4 and pool5 from the backbone VGG-16 are removed and, following [32], the conv5 layers are replaced with dilated conv layers with a dilation rate of 2. For the G-pooling version, pool1 and pool2 are replaced with G-pooling and we keep pool3. Thus there are three max pooling layers in the baseline, and one G-pooling layer and one max pooling layer in our proposed version. SegNet uses an encoder-decoder architecture and preserves the max pooling indices for unpooling in the decoder. Similar to DeepLab, there are 5 max pooling layers in total in the encoder of SegNet, so pool1 and pool2 are replaced with the proposed G-pool1, pool3 and pool4 are replaced with G-pool2, and pool5 is kept. This leads us to use a 4 × 4 unpooling window to recover the spatial resolution where the original ones are just 2 × 2. Thus there are two G-pooling layers and one max pooling layer in our SegNet version.
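The index-preserving pooling and unpooling that SegNet relies on can be sketched in a few lines. The following is a minimal numpy illustration of the mechanism (shown with a 2 × 2 window for brevity; the helper names are ours, not the authors' code):

```python
import numpy as np

def max_pool_with_indices(x: np.ndarray, k: int):
    """k x k non-overlapping max pooling that also returns the flat index of
    each max within its window (SegNet-style)."""
    h, w = x.shape
    # split into (h//k, w//k) windows of size k*k, row-major within each window
    patches = (x.reshape(h // k, k, w // k, k)
                .transpose(0, 2, 1, 3)
                .reshape(h // k, w // k, k * k))
    idx = patches.argmax(axis=-1)
    return patches.max(axis=-1), idx

def unpool_with_indices(pooled: np.ndarray, idx: np.ndarray, k: int) -> np.ndarray:
    """Scatter pooled values back to their recorded positions; zeros elsewhere."""
    h, w = pooled.shape
    out = np.zeros((h, w, k * k))
    rows, cols = np.indices((h, w))
    out[rows, cols, idx] = pooled
    # undo the window flattening to recover an (h*k, w*k) map
    return (out.reshape(h, w, k, k)
               .transpose(0, 2, 1, 3)
               .reshape(h * k, w * k))
```

With a 4 × 4 window, as in the SegNet variant described above, the same mechanism recovers a 4× larger spatial resolution per unpooling step instead of 2×.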

As can be seen in Table 4, G-pooling is able to improve the model accuracy for Potsdam, 67.97% → 68.33%. The improvement on the generalization test Potsdam→Vaihingen is even more obvious: G-pooling improves mIoU from 38.57 to 40.04. Similar observations can be made for SegNet and FCN. For Vaihingen, even though the model accuracy is not as high as the baseline, the difference is small: the mIoU of our versions of DeepLab, SegNet and FCN is less than 1% lower. We note that Vaihingen is an easier dataset than Potsdam, since it only includes urban scenes while Potsdam includes both urban and non-urban. However, the generalizability of our model using G-pooling is much better. As shown, when testing on Potsdam using a model trained on Vaihingen, FCN with G-pooling is able to achieve 23.02% mIoU, an improvement of 7.54% IoU. The same observations can be made for DeepLab and SegNet.

Discussion

Incorporating knowledge is not a novel approach for neural networks. Before deep learning, there was work on rule-based neural networks which required expert knowledge to design the network for specific applications. Due to the large capacity of deep models, deep learning has become the primary approach to address vision problems. However, deep learning is a data-driven approach whose success relies significantly on the amount of training data. If the model is trained with a large amount of data then it will generalize well. But often, particularly in overhead image segmentation, the dataset is not as large as ImageNet or Cityscapes. This causes overfitting. Early stopping, cross-validation, etc. can help to avoid overfitting. Still, if domain shift exists between the training and test sets, deep models do not perform well. In this work, we propose a knowledge-incorporated approach to reduce overfitting. We address the question of how to incorporate knowledge directly into deep models by proposing a novel pooling method for overhead image segmentation. Some issues still need discussing, as follows.

Scenarios using G-pooling As mentioned in Section 3, G-pooling is developed using Getis-Ord Gi* analysis, which quantifies how spatial convergence occurs. This is a simulated process designed for geospatial data downsampling. Thus it is not necessarily appropriate for other image datasets. This is a more general restriction of incorporating knowledge. Getis-Ord Gi* analysis provides a method to identify spatial clusters while training. The effect is similar to conditional random fields/Markov random fields in standard computer vision post-processing methods. However, it is different from them since the spatial clustering changes dynamically based on the feature maps and the geospatial location, while post-processing methods rely on the predictions of the models.
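As a rough illustration of the statistic underlying G-pooling, here is a minimal numpy sketch of a Getis-Ord Gi*-style z-score computed for one cell of a 2-D grid, assuming binary weights over a square neighborhood (the function name and these simplifications are ours; the paper's exact formulation may differ, e.g. in its weighting scheme):

```python
import numpy as np

def getis_ord_g_star(x: np.ndarray, i: int, j: int, radius: int = 1) -> float:
    """Simplified Getis-Ord Gi* z-score for cell (i, j) of a 2-D grid.

    Uses binary spatial weights: 1 inside the (2*radius+1)^2 window (including
    the center cell), 0 outside. Windows are truncated at grid edges, and
    constant-valued grids (zero variance) are not handled.
    """
    n = x.size
    x_bar = x.mean()
    s = np.sqrt((x ** 2).mean() - x_bar ** 2)  # population std deviation
    window = x[max(0, i - radius): i + radius + 1,
               max(0, j - radius): j + radius + 1]
    w_sum = window.size       # sum of binary weights
    local_sum = window.sum()  # sum of weighted values
    denom = s * np.sqrt((n * w_sum - w_sum ** 2) / (n - 1))
    return (local_sum - x_bar * w_sum) / denom
```

A large positive z-score marks a hotspot (a spatial cluster of high values), which is the signal the paper's pooling step exploits.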

Local geospatial pattern

We now explain how G-pooling works in deep neural networks. Getis-Ord Gi* analysis is usually used for global-region hotspot detection, which describes geospatial convergence. As shown in Figure 3, G-pooling is applied twice to downsample the feature map. The spatial sizes of the feature maps after G-pooling are 64 × 64 and 16 × 16 respectively. Max pooling reduces the size of the feature map by 1/2, while ours reduces it by 1/4. This is because we want to compute Gi* over a larger region.

Even though Gi* is usually computed over a larger region than in our framework, it still captures spatial convergence within a small region. Also, the two G-pooling operations are applied at different scales of the feature map, so a larger region of the input image is effectively considered. Specifically, the first 4 × 4 pooling window is slid over the 256 × 256 feature map and the output feature map has size 64 × 64. This is fed through the next conv layers and a second G-pooling is applied. At this stage, the input feature map is 64 × 64, so when a 4 × 4 sliding window is used, a region of 16 × 16 is effectively considered, which is 1/16 of the whole image.
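The window arithmetic above can be sketched with plain max pooling standing in for G-pooling (the helper is a hypothetical stand-in, only meant to show the 256 → 64 → 16 size reduction and the growing effective receptive field):

```python
import numpy as np

def pool_4x4(x: np.ndarray) -> np.ndarray:
    """Non-overlapping 4x4 pooling: each side shrinks by a factor of 4.
    Max is used here purely as a placeholder for the G-pooling operator."""
    h, w = x.shape
    return x.reshape(h // 4, 4, w // 4, 4).max(axis=(1, 3))

fmap = np.random.rand(256, 256)   # input-resolution feature map
first = pool_4x4(fmap)            # 256 -> 64
second = pool_4x4(first)          # 64 -> 16
# A 4x4 window on the second map spans a 16x16 region of the original input,
# i.e. 1/16 of the image side length, matching the text above.
```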

Limitations There are some limitations to our work. For example, we did not investigate the optimal window size for performing Getis-Ord Gi* analysis. We also only consider one kind of spatial pattern, clusters. And there might be better places than pooling to incorporate knowledge in CNN architectures.

-
Conclusion

In this paper, we investigate how geospatial knowledge can be incorporated into deep learning for geospatial image analysis. We demonstrate that incorporating geospatial rules improves performance. We realize, though, that ours is just preliminary work into geospatially guided deep learning. We note the limitations of our approach, for example, that the prior distribution does not provide benefits for classes for which this prior knowledge is not relevant. Our proposed approach does not show much improvement in the single-dataset case, especially for a small dataset. ISPRS Vaihingen is a very small dataset which contains only around 500 images of size 256 × 256. In the future, we will explore other ways to encode geographic rules so they can be incorporated into deep learning models.

Figure 2 :

Figure 2: Given a feature map as input, max pooling (top right) and the proposed G-pooling (bottom right) create different downsampled output feature maps based on the characteristics of spatial clusters. The feature map within the sliding window (blue dotted line) indicates a spatial cluster. Max pooling takes the max value, ignoring the spatial cluster, while our G-pooling takes the interpolated value at the center location. (White, gray and black represent three values ranging from low to high.)

+
Conclusion

In this paper, we investigate how geospatial knowledge can be incorporated into deep learning for geospatial image analysis. We demonstrate that incorporating geospatial rules improves performance. We realize, though, that ours is just preliminary work into geospatially guided deep learning. We note the limitations of our approach, for example, that the prior distribution does not provide benefits for classes for which this prior knowledge is not relevant. Our proposed approach does not show much improvement in the single-dataset case, especially for a small dataset. ISPRS Vaihingen is a very small dataset which contains only around 500 images of size 256 × 256. In the future, we will explore other ways to encode geographic rules so they can be incorporated into deep learning models.

+
Figure 2 :

Figure 2: Given a feature map as input, max pooling (top right) and the proposed G-pooling (bottom right) create different downsampled output feature maps based on the characteristics of spatial clusters. The feature map within the sliding window (blue dotted line) indicates a spatial cluster. Max pooling takes the max value, ignoring the spatial cluster, while our G-pooling takes the interpolated value at the center location. (White, gray and black represent three values ranging from low to high.)

Figure 3 :

Figure 3: An FCN network architecture with G-pooling.

Figure 4 :

Figure 4: Qualitative results of ISPRS Potsdam.White: road, blue: building, cyan: low vegetation, green: trees, yellow: cars, red: clutter.

Table 1 :

Experimental results of FCN using VGG-16 as the backbone. Stride conv, P-pooling and our G-pooling are used to replace the standard max/average pooling.

Potsdam
Methods               Roads  Buildings  Low Veg.  Trees  Cars   mIoU   Pixel Acc.
Max                   70.62  74.28      65.94     61.36  61.40  66.72  79.55
Average               69.34  74.49      63.94     60.06  60.28  65.62  78.08
Stride conv           67.22  73.97      63.01     60.09  59.39  64.74  77.54
P-pooling             71.97  75.55      66.80     62.03  62.39  67.75  81.02
G-pooling-1.0 (ours)  68.59  77.39      67.48     55.56  62.18  66.24  79.43
G-pooling-1.5 (ours)  70.06  76.12      67.67     62.12  63.91  67.98  81.63
G-pooling-2.0 (ours)  70.99  74.89      65.34     61.57  60.77  66.71  79.46

Vaihingen
Methods               Roads  Buildings  Low Veg.  Trees  Cars   mIoU   Pixel Acc.
Max                   70.63  80.42      51.57     70.12  55.32  65.61  81.88
Average               70.54  79.86      50.49     69.18  54.83  64.98  79.98
Stride conv           68.36  77.65      49.21     67.34  53.29  63.17  79.44
P-pooling             71.06  80.52      51.70     70.93  53.65  65.57  82.44
G-pooling-1.0 (ours)  72.15  79.69      53.28     70.89  53.72  65.95  81.78
G-pooling-1.5 (ours)  71.61  78.74      48.18     68.53  55.64  64.54  80.42
G-pooling-2.0 (ours)  71.09  78.88      50.62     68.32  54.01  64.58  80.75
From 63ede452b78a0bef7d6686a6541c2763309e4f4e Mon Sep 17 00:00:00 2001
From: Angele Zamarron
Date: Tue, 15 Aug 2023 14:17:15 -0700
Subject: [PATCH 2/4] body sections paragraphs sentences and test

---
 ...grobid_augment_existing_document_parser.py | 77 +++++++++++++++++++
 ...grobid_augment_existing_document_parser.py | 23 ++++++
 2 files changed, 100 insertions(+)

diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py
index 3f73284d..452ee5f3 100644
--- a/src/mmda/parsers/grobid_augment_existing_document_parser.py
+++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py
@@ -99,6 +99,34 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document:
             )
         )
 
+        # sections
+        # Grobid provides coordinates and number attributes for section headers, and coordinates for
+        # sentences within the body text, also tagged by paragraphs.
+        # We use these to annotate the document in order to provide a hierarchical structure:
+        # e.g. doc.sections.header, doc.sections[0].paragraphs[0].sentences[0]
+        section_box_groups, heading_box_groups, paragraph_box_groups, sentence_box_groups = \
+            self._get_structured_body_text_box_groups(xml_root)
+        doc.annotate(
+            sections=box_groups_to_span_groups(
+                section_box_groups, doc, center=True
+            )
+        )
+        doc.annotate(
+            headings=box_groups_to_span_groups(
+                heading_box_groups, doc, center=False
+            )
+        )
+        doc.annotate(
+            paragraphs=box_groups_to_span_groups(
+                paragraph_box_groups, doc, center=True
+            )
+        )
+        doc.annotate(
+            sentences=box_groups_to_span_groups(
+                sentence_box_groups, doc, center=True
+            )
+        )
+
         return doc
 
     def _xml_coords_to_boxes(self, coords_attribute: str):
@@ -172,3 +200,52 @@ def _get_box_groups(
         else:
             box_groups.append(BoxGroup(boxes=boxes))
         return box_groups
+
+    def _get_heading_box_group(
+        self,
+        section_div: et.Element
+    ) -> Optional[BoxGroup]:
+        box_group = None
+        heading_element = section_div.find(f".//tei:head", NS)
+        if heading_element is not None:  # elements evaluate as False if no children
+            coords_string = heading_element.attrib["coords"]
+            boxes = self._xml_coords_to_boxes(coords_string)
+            number = heading_element.attrib["n"] if "n" in heading_element.keys() else None
+            section_title = heading_element.text
+            box_group = BoxGroup(
+                boxes=boxes,
+                metadata=Metadata(number=number, title=section_title),
+            )
+        return box_group
+
+    def _get_structured_body_text_box_groups(
+        self,
+        root: et.Element
+    ) -> (List[BoxGroup], List[BoxGroup], List[BoxGroup], List[BoxGroup]):
+        section_list_root = root.find(f".//tei:body", NS)
+
+        body_sections: List[BoxGroup] = []
+        body_headings: List[BoxGroup] = []
+        body_paragraphs: List[BoxGroup] = []
+        body_sentences: List[BoxGroup] = []
+
+        section_divs = section_list_root.findall(f"./tei:div", NS)
+        for div in section_divs:
+            section_boxes: List[Box] = []
+            heading_box_group = self._get_heading_box_group(div)
+            if heading_box_group:
+                body_headings.append(heading_box_group)
+                section_boxes.extend(heading_box_group.boxes)
+            for p in div.findall(f"./tei:p", NS):
+                paragraph_boxes: List[Box] = []
+                paragraph_sentences: List[BoxGroup] = []
+                for s in p.findall(f"./tei:s", NS):
+                    sentence_boxes = self._xml_coords_to_boxes(s.attrib["coords"])
+                    paragraph_sentences.append(BoxGroup(boxes=sentence_boxes))
+                    paragraph_boxes.extend(sentence_boxes)
+                body_paragraphs.append(BoxGroup(boxes=paragraph_boxes))
+                section_boxes.extend(paragraph_boxes)
+                body_sentences.extend(paragraph_sentences)
+            body_sections.append(BoxGroup(boxes=section_boxes))
+
+        return body_sections, body_headings, body_paragraphs, body_sentences
diff --git a/tests/test_parsers/test_grobid_augment_existing_document_parser.py b/tests/test_parsers/test_grobid_augment_existing_document_parser.py
index 73031283..f16256f6 100644
--- a/tests/test_parsers/test_grobid_augment_existing_document_parser.py
+++ b/tests/test_parsers/test_grobid_augment_existing_document_parser.py
@@ -89,6 +89,29 @@ def test_processes_full_text(self, mock_request):
             assert m.box_group.metadata.target_id in bib_entry_grobid_ids
         assert mentions_with_targets == 66
 
+        # structured body text (sections, paragraphs, sentences)
+        assert len(augmented_doc.sections) == 20
+        assert len(augmented_doc.paragraphs) == 40
+        assert len(augmented_doc.sentences) == 249
+
+        for section in augmented_doc.sections:
+            assert len(section.headings) == 1
+            if section.id == 0:
+                assert section.headings[0].text == "1. Introduction"
+                assert section.headings[0].box_group.metadata.number == "1."
+                assert section.headings[0].box_group.metadata.title == "Introduction"
+            for paragraph in section.paragraphs:
+                if paragraph.id == 0:
+                    assert paragraph.text.startswith(
+                        "Research in remote sensing has been steadily increasing"
+                    )
+                    assert paragraph.sentences[-1].text.endswith(", etc.")
+                for sentence in paragraph.sentences:
+                    if sentence.id == 0:
+                        assert sentence.text.startswith(
+                            "Research in remote sensing has been steadily increasing"
+                        )
+
     @um.patch("requests.request", side_effect=mock_request)
     def test_passes_if_xml_missing_authors(self, mock_request):
         with open(PDFPLUMBER_DOC_PATH) as f_in:

From c102d355650f2680e9ab39675e2ef773c10f07a9 Mon Sep 17 00:00:00 2001
From: Angele Zamarron
Date: Tue, 15 Aug 2023 14:26:22 -0700
Subject: [PATCH 3/4] versionne

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8a0523ea..a1371dbb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = 'mmda'
-version = '0.9.10'
+version = '0.9.11'
 description = 'MMDA - multimodal document analysis'
 authors = [
     {name = 'Allen Institute for Artificial Intelligence', email = 'contact@allenai.org'},

From 396e739c948c7e02d282d03e14d4a67fab23d3e8 Mon Sep 17 00:00:00 2001
From: Angele Zamarron
Date: Tue, 15 Aug 2023 15:16:12 -0700
Subject: [PATCH 4/4] meant for this to be True

---
 src/mmda/parsers/grobid_augment_existing_document_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py
index 452ee5f3..24f3ca27 100644
--- a/src/mmda/parsers/grobid_augment_existing_document_parser.py
+++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py
@@ -113,7 +113,7 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document:
         )
         doc.annotate(
             headings=box_groups_to_span_groups(
-                heading_box_groups, doc, center=False
+                heading_box_groups, doc, center=True
            )
         )
         doc.annotate(
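For readers unfamiliar with Grobid's TEI output, the `coords` attributes consumed by `_xml_coords_to_boxes` in the patches above are semicolon-separated `page,x,y,w,h` groups. A standalone sketch of that parsing step follows; the `Box` dataclass is a stand-in for mmda's own type, and the shift from Grobid's 1-indexed pages to 0-indexed pages is our assumption about the conversion:

```python
from dataclasses import dataclass
from typing import List

@dataclass
class Box:
    # stand-in for mmda's Box: page index plus left/top/width/height
    page: int
    l: float
    t: float
    w: float
    h: float

def xml_coords_to_boxes(coords: str) -> List[Box]:
    """Parse a Grobid TEI @coords attribute ("page,x,y,w,h;page,x,y,w,h;...")
    into one Box per semicolon-separated group.

    Assumes Grobid's 1-indexed page numbers map to 0-indexed pages.
    """
    boxes = []
    for group in coords.split(";"):
        page, x, y, w, h = group.split(",")
        boxes.append(Box(page=int(page) - 1,
                         l=float(x), t=float(y),
                         w=float(w), h=float(h)))
    return boxes
```

Each TEI `<head>`, `<p>`, and `<s>` element carries such an attribute, which is why the patch can build section, paragraph, and sentence box groups from the same helper.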