Skip to content

Commit

Permalink
working on asciidocs, struggling with ImageRef
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Oct 22, 2024
1 parent c23d049 commit 1c0a766
Show file tree
Hide file tree
Showing 6 changed files with 335 additions and 136 deletions.
164 changes: 139 additions & 25 deletions docling/backend/asciidoc_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
GroupLabel,
TableCell,
TableData,
ImageRef,
)

from docling.backend.abstract_backend import DeclarativeDocumentBackend
Expand Down Expand Up @@ -83,14 +84,20 @@ def parse(self, doc: DoclingDocument):

in_list = False
in_table = False

text_data = []
table_data = []
caption_data = []

parents = {}
indents = {}

for i in range(0, 10):
parents[i] = None

indents[i] = None

for line in self.lines:
line = line.strip()
#line = line.strip()

# Title
if self.is_title(line):
Expand All @@ -111,42 +118,106 @@ def parse(self, doc: DoclingDocument):

# Lists
elif self.is_list_item(line):

print("line: ", line)
item = self.parse_list_item(line)
print("parsed list-item: ", item)

level = self.get_current_level(parents)

if not in_list:
in_list = True

level = self.get_current_level(parents)

parents[level+1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level+1] = item["indent"]

item = self.parse_list_item(line)
elif in_list and item["indent"]>indents[level]:
parents[level+1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level+1] = item["indent"]

elif in_list and item["indent"]<indents[level]:

print(item["indent"], " => ", indents[level])
while item["indent"]<indents[level]:
print(item["indent"], " => ", indents[level])
parents[level] = None
indents[level] = None
level -= 1

doc.add_list_item(item["text"], parent=self.get_current_parent(parents))

elif in_list and not self.is_list_item(line):
in_list = False

level = self.get_current_level(parents)
parents[level]=None

# Tables
elif self.is_table_line(line):
elif line.strip()=="|===" and not in_table: # start of table
in_table = True

elif self.is_table_line(line): # within a table
in_table = True
table_data.append(self.parse_table_line(line))

elif in_table and not self.is_table_line(line):
elif in_table and ((not self.is_table_line(line)) or line.strip()=="|==="): # end of table

caption = None
if len(caption_data)>0:
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)

caption_data = []

data = self.populate_table_as_grid(table_data)
doc.add_table(data=data, parent=self.get_current_parent(parents))
doc.add_table(data=data, parent=self.get_current_parent(parents), caption=caption)

in_table = False
table_data = []

# Picture
elif self.is_picture(line):

caption = None
if len(caption_data)>0:
caption = doc.add_text(text=" ".join(caption_data), label=DocItemLabel.CAPTION)

caption_data = []

item = self.parse_picture(line)
print(item)

image = ImageRef(mimetype="image/png", size=[100,100], dpi=70, uri=item["uri"])
doc.add_picture(image=image, caption=caption)

# Caption
elif self.is_caption(line) and len(caption_data)==0:
item = self.parse_caption(line)
caption_data.append(item["text"])

elif len(line.strip())>0 and len(caption_data)>0: # allow multiline captions
item = self.parse_text(line)
caption_data.append(item["text"])

# Plain text
elif len(line)>0:
elif len(line.strip())==0 and len(text_data)>0:
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents))
text_data = []

elif len(line.strip())>0: # allow multiline texts

item = self.parse_text(line)
doc.add_text(text=item["text"], label=DocItemLabel.PARAGRAPH, parent=self.get_current_parent(parents))
text_data.append(item["text"])

if len(text_data) > 0:
doc.add_text(text=" ".join(text_data), label=DocItemLabel.PARAGRAPH,
parent=self.get_current_parent(parents))
text_data = []

if in_table and len(table_data) > 0:
data = self.populate_table_as_grid(table_data)
doc.add_table(data=data, parent=self.get_current_parent(parents))
Expand All @@ -170,14 +241,14 @@ def get_current_parent(self, parents):

return None

# Title
# ========= Title
def is_title(self, line):
    """Return a match object if ``line`` begins with "= ", otherwise ``None``."""
    title_pattern = r"^= "
    return re.match(title_pattern, line)

def parse_title(self, line):
    """Build the title record for ``line``.

    The first two characters (the "= " marker) are dropped and the remainder
    is stripped of surrounding whitespace. The level is always 0.
    """
    title_text = line[2:].strip()
    return {"type": "title", "text": title_text, "level": 0}

# Section headers
# ========= Section headers
def is_section_header(self, line):
    """Return a match object if ``line`` starts with two or more "=" signs, else ``None``."""
    header_pattern = r"^==+"
    return re.match(header_pattern, line)

Expand All @@ -194,26 +265,31 @@ def parse_section_header(self, line):
"text": text.strip(),
}

# Lists
# ========= Lists
def is_list_item(self, line):
    """Return a match object if ``line`` looks like a list item, else ``None``.

    A list item is optional leading whitespace (the nesting indent), then one
    of the markers ``*``, ``-``, ``<digits>.`` or ``<word chars>.``, followed
    by a single space.
    """
    # NOTE(review): the `\w+\.` alternative also matches ordinary prose such
    # as "Hello. World" -- confirm that this is intended.
    return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)

def parse_list_item(self, line):
    """Extract the indent, the item marker and the text of a list item.

    Returns a dict with the keys:
      - "type": always "list_item"
      - "marker": the list marker (e.g. "*", "-", "1.", "a."), or "" when
        the line does not match the list-item pattern
      - "text": the item text, stripped of surrounding whitespace
      - "numbered": False for "*" and "-" markers, True for any other marker
      - "indent": number of leading whitespace characters
    """
    # The marker alternatives mirror is_list_item (including `\w+\.`) so that
    # every line accepted there is parsed here instead of hitting the fallback.
    match = re.match(r"^(\s*)(\*|-|\d+\.|\w+\.)\s+(.*)", line)
    if not match:
        # Fallback if no match: there is no marker to report.
        return {"type": "list_item", "marker": "", "text": line.strip(),
                "numbered": False, "indent": 0}

    indent, marker, text = match.groups()
    return {
        "type": "list_item",
        "marker": marker,
        "text": text.strip(),
        "numbered": marker not in ("*", "-"),
        # `(\s*)` always captures a string (possibly empty), never None.
        "indent": len(indent),
    }

# Tables
# ========= Tables
def is_table_line(self, line):
    """Return a match object if ``line`` starts with "|" and contains a later "|", else ``None``."""
    row_pattern = r"^\|.*\|"
    return re.match(row_pattern, line)

Expand Down Expand Up @@ -252,6 +328,44 @@ def populate_table_as_grid(self, table_data):

return data

# Plain text
# ========= Pictures
def is_picture(self, line):
    """Return a match object if ``line`` starts with the "image::" macro prefix, else ``None``."""
    macro_prefix = r"^image::"
    return re.match(macro_prefix, line)

def parse_picture(self, line):
    """
    Parse an image macro, extracting its path and attributes.

    Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]

    Returns a dict that always has "type" ("picture") and "uri". When the
    bracketed attribute list is present it also has "alt" (the first
    attribute, possibly empty) and one entry per named ``key=value`` attribute.
    """
    # Strip first so trailing whitespace or a line ending cannot defeat `$`.
    line = line.strip()
    mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
    if not mtch:
        # No attribute list: report the target without the macro prefix.
        uri = line[len("image::"):] if line.startswith("image::") else line
        return {"type": "picture", "uri": uri.strip()}

    picture_info = {"type": "picture", "uri": mtch.group(1).strip()}

    # str.split always yields at least one element, so index 0 is safe.
    attributes = mtch.group(2).split(",")
    picture_info["alt"] = attributes[0].strip()

    # Extract optional named attributes (width, height, alignment, ...).
    for attr in attributes[1:]:
        # partition splits on the first "=" only, so values that contain
        # "=" themselves (e.g. URLs with a query string) stay intact.
        key, sep, value = attr.partition("=")
        key = key.strip()
        if not sep or not key:
            # Positional attribute or empty key: nothing to name it by.
            continue
        picture_info[key] = value.strip()

    return picture_info

# ========= Captions
def is_caption(self, line):
    """Return a match object if ``line`` is a "." followed by at least one character, else ``None``."""
    caption_pattern = r"^\.(.+)"
    return re.match(caption_pattern, line)

def parse_caption(self, line):
    """Build the caption record for ``line``.

    The text is everything after the leading "." (left unstripped); it is the
    empty string when ``line`` does not have the caption form.
    """
    found = re.match(r"^\.(.+)", line)
    caption_text = found.group(1) if found else ""
    return {"type": "caption", "text": caption_text}

# ========= Plain text
def parse_text(self, line):
    """Build the plain-text record for ``line``.

    The text is stripped of surrounding whitespace, so fragments can be joined
    with single spaces without carrying over indentation or line endings.
    """
    return {"type": "text", "text": line.strip()}
Loading

0 comments on commit 1c0a766

Please sign in to comment.