Work in progress on the AsciiDoc backend; still struggling with ImageRef

Signed-off-by: Peter Staar <[email protected]>
DS4SD · Oct 22, 2024 · 1c0a766
1 parent c23d049
commit 1c0a766
Show file tree

Hide file tree

Showing 6 changed files with 335 additions and 136 deletions.
diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py
@@ -11,6 +11,7 @@
     GroupLabel,
     TableCell,
     TableData,
+    ImageRef,
 )
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -83,14 +84,20 @@ def parse(self, doc: DoclingDocument):
 
         in_list = False
         in_table = False
+
+        text_data = []
         table_data = []
+        caption_data = []
 
         parents = {}
+        indents = {}
+
         for i in range(0, 10):
             parents[i] = None
-
+            indents[i] = None
+
         for line in self.lines:
-            line = line.strip()
+            #line = line.strip()
 
             # Title
             if self.is_title(line):
@@ -111,42 +118,106 @@ def parse(self, doc: DoclingDocument):
 
             # Lists
             elif self.is_list_item(line):
+
+                print("line: ", line)
+                item = self.parse_list_item(line)
+                print("parsed list-item: ", item)
+
+                level = self.get_current_level(parents)
+
                 if not in_list:
                     in_list = True
-
-                    level = self.get_current_level(parents)
 
                     parents[level+1] = doc.add_group(
                         parent=parents[level], name="list", label=GroupLabel.LIST
                     )
+                    indents[level+1] = item["indent"]
 
-                item = self.parse_list_item(line)
+                elif in_list and item["indent"]>indents[level]:
+                    parents[level+1] = doc.add_group(
+                        parent=parents[level], name="list", label=GroupLabel.LIST
+                    )
+                    indents[level+1] = item["indent"]
+
+                elif in_list and item["indent"]<indents[level]:                    
+
+                    print(item["indent"], " => ", indents[level])
+                    while item["indent"]<indents[level]:
+                        print(item["indent"], " => ", indents[level])
+                        parents[level] = None
+                        indents[level] = None
+                        level -= 1
+
                 doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
 
             elif in_list and not self.is_list_item(line):
                 in_list = False
 
                 level = self.get_current_level(parents)
                 parents[level]=None
-                
+
             # Tables
-            elif self.is_table_line(line):
+            elif line.strip()=="|===" and not in_table: # start of table
+                in_table = True
+
+            elif self.is_table_line(line): # within a table
                 in_table = True
                 table_data.append(self.parse_table_line(line))
 
-            elif in_table and not self.is_table_line(line):
+            elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table
 
+                caption = None
+                if len(caption_data)>0:
+                    caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
+
+                caption_data = []                
+
                 data = self.populate_table_as_grid(table_data)
-                doc.add_table(data=data, parent=self.get_current_parent(parents))
+                doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption)
 
                 in_table = False
                 table_data = []
+
+            # Picture
+            elif self.is_picture(line):
+
+                caption = None
+                if len(caption_data)>0:
+                    caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)
+
+                caption_data = []                
+
+                item = self.parse_picture(line)
+                print(item)
+
+                image = ImageRef(mimetype="image/png", size=[100,100], dpi=70, uri=item["uri"])
+                doc.add_picture(image=image, caption=caption)
+
+            # Caption
+            elif self.is_caption(line) and len(caption_data)==0:
+                item = self.parse_caption(line)
+                caption_data.append(item["text"])
 
+            elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions
+                item = self.parse_text(line)
+                caption_data.append(item["text"])
+
             # Plain text
-            elif len(line)>0:
+            elif len(line.strip())==0 and len(text_data)>0:
+                doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
+                             parent=self.get_current_parent(parents))
+                text_data = []
+
+            elif len(line.strip())>0: # allow multiline texts
+
                 item = self.parse_text(line)
-                doc.add_text(text=item["text"], label=DocItemLabel.PARAGRAPH, parent=self.get_current_parent(parents))
+                text_data.append(item["text"])
 
+        if len(text_data) > 0:
+            doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
+                         parent=self.get_current_parent(parents))
+            text_data = []
+
         if in_table and len(table_data) > 0:
             data = self.populate_table_as_grid(table_data)
             doc.add_table(data=data, parent=self.get_current_parent(parents))
@@ -170,14 +241,14 @@ def get_current_parent(self, parents):
 
         return None
 
-    # Title
+    #   =========   Title
     def is_title(self, line):
         return re.match(r"^= ", line)
 
     def parse_title(self, line):
         return {"type": "title", "text": line[2:].strip(), "level":0}
 
-    # Section headers
+    #   =========   Section headers
     def is_section_header(self, line):
         return re.match(r"^==+", line)
 
@@ -194,26 +265,31 @@ def parse_section_header(self, line):
             "text": text.strip(),
         }
 
-    # Lists
+    #   =========   Lists
     def is_list_item(self, line):
-        return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
+        return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
 
     def parse_list_item(self, line):
         """Extract the item marker (number or bullet symbol) and the text of the item."""
 
-        match = re.match(r"^(\*|-|\d+\.)\s+(.*)", line)
+        match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
         if match:
-            item_marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
-            item_text = match.group(2)    # The actual text of the list item
-            if item_marker=="*" or item_marker=="-":
-                return {"type": "list_item", "marker": item_marker, "text": item_text.strip(), "numbered": False}
+            indent = match.group(1)
+            marker = match.group(2)  # The list marker (e.g., "*", "-", "1.")
+            text = match.group(3)   # The actual text of the list item
+
+            if marker=="*" or marker=="-":
+                return {"type": "list_item", "marker": marker, "text": text.strip(),
+                        "numbered": False, "indent": 0 if indent==None else len(indent)}
             else:
-                return {"type": "list_item", "marker": item_marker, "text": item_text.strip(), "numbered": True}
+                return {"type": "list_item", "marker": marker, "text": text.strip(),
+                        "numbered": True, "indent": 0 if indent==None else len(indent)}
         else:
             # Fallback if no match
-            return {"type": "list_item", "marker": item_marker, "text": line, "numbered": False}
+            return {"type": "list_item", "marker": item_marker, "text": line,
+                    "numbered": False, "indent": 0}
 
-    # Tables
+    #   =========   Tables
     def is_table_line(self, line):
         return re.match(r"^\|.*\|", line)
 
@@ -252,6 +328,44 @@ def populate_table_as_grid(self, table_data):
 
         return data
 
-    # Plain text
+    #   =========   Pictures
+    def is_picture(self, line):
+        return re.match(r"^image::", line)
+
+    def parse_picture(self, line):
+        """
+        Parse an image macro, extracting its path and attributes.
+        Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
+        """
+        mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
+        if mtch:
+            picture_path = mtch.group(1).strip()
+            attributes = mtch.group(2).split(',')
+            picture_info = {"type": "picture", "uri": picture_path}
+
+            # Extract optional attributes (alt text, width, height, alignment)
+            if attributes:
+                picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
+                for attr in attributes[1:]:
+                    key, value = attr.split('=')
+                    picture_info[key.strip()] = value.strip()
+
+            return picture_info
+
+        return {"type": "picture", "uri": line}
+
+    #   =========   Captions
+    def is_caption(self, line):
+        return re.match(r"^\.(.+)", line)
+
+    def parse_caption(self, line):
+        mtch = re.match(r"^\.(.+)", line)
+        if mtch:
+            text = mtch.group(1)
+            return {"type": "caption", "text": text}
+
+        return {"type": "caption", "text": ""}
+
+    #   =========   Plain text
     def parse_text(self, line):
-        return {"type": "text", "text": line}
+        return {"type": "text", "text": line.strip()}