Skip to content

Cheat sheet

tato edited this page Mar 8, 2023 · 1 revision

Nokolexbor cheat sheet

Creating documents

Everything in Nokolexbor starts with creating a Document.

# Create a blank document with `new`
doc = Nokolexbor::Document.new
# Create a blank document by parsing an empty string
doc = Nokolexbor::HTML('')
# Create a document by parsing a string of HTML
doc = Nokolexbor::HTML('<div></div>')
# Create a document by passing any object that responds to `#read`
doc = Nokolexbor::HTML(URI.open('https://github.com/serpapi/nokolexbor'))

Working with documents

doc = Nokolexbor.HTML <<-HTML
<html>
  <head><title>This is title</title></head>
  <body>
    <div class="a">
      <span>Text</span>
    </div>
  </body>
</html>
HTML

# Get the title
doc.title
# => "This is title"

# Set the title
doc.title = 'New title'
doc.at_css('head').to_html
# => "<head><title>New title</title></head>"

# Get the root node
doc.root
# => #<Nokolexbor::Element <html>>

# The document itself is a `Node` that refers to the root node of the document, so every method of `Node` applies to `Document`. For example, searching with a CSS selector:
doc.at_css('div')
# => #<Nokolexbor::Element <div class="a">>

Creating nodes

# Before creating nodes, you need a document
doc = Nokolexbor::HTML('')

# Create an element
doc.create_element("div") # <div></div>
Nokolexbor::Element.new('div', doc) # <div></div>
doc.create_element("div", class: "a") # <div class="a"></div>
doc.create_element("div", "Text") # <div>Text</div>
doc.create_element("div", "Text", class: "a", style: "b") # <div class="a" style="b">Text</div>
doc.create_element("div") { |node| node['class'] = "a" } # <div class="a"></div>

# Create a text node
doc.create_text_node("Some text") # => Some text
Nokolexbor::Text.new("Some text", doc) # => Some text

# Create a comment
doc.create_comment("Some comment") # <!--Some comment-->
Nokolexbor::Comment.new("Some comment", doc) # <!--Some comment-->

# Create a cdata
doc.create_cdata("Some CDATA")
Nokolexbor::CDATA.new("Some CDATA", doc)

# Create a processing instruction
Nokolexbor::ProcessingInstruction.new("xml", "some data", doc) # <?xml some data>

Searching nodes

doc = Nokolexbor.HTML <<-HTML
  <div>
    <span class="a">Text 1</span>
    <span class="b">Text 2</span>
    <span class="a">Text 3</span>
    <span class="b">Text 4</span>
    <span class="a">Text 5</span>
    <span class="b">Text 6</span>
    <div><span>Text 7</span>Text 8</div>
  </div>
HTML
# Search for all the matching nodes with css selector
doc.css('div > span.a')
# Search for all the matching nodes with multiple css selectors
doc.css('div > span.a, div > div')
# Search for all the matching nodes among the children of the called node, with multiple css selectors
doc.at_css('div').css('> span.a, > div')
# Search for the first matching node with css selector
# `#at_css` is faster than `#css` if you only need the first result
doc.at_css('div > span.a')
# Search for the first matching text node with css selector
doc.at_css('div > div > ::text')

# -------------
# Search for all the matching nodes with xpath
doc.xpath('//div / span[@class="a"]')
# Search for all the matching nodes with multiple xpath
doc.xpath('//div / span[@class="a"]', '//div / div')
# Search for the first matching node with xpath
doc.at_xpath('//div / span[@class="a"]')
# Search for the first matching text node with xpath
doc.at_xpath('//div / div / text()')

Working with nodes

node.document # Get owner document

node.name # Get node name
node.name= # Set node name

# Node type
node.type # enum of type
node.cdata?
node.comment?
node.element?
node.fragment?
node.document?
node.text?

# Attributes
node['href'] # Get attribute value (String)
node['href'] = 'http://example.com' # Set attribute value
node.key?('href') # Has attribute?
node.keys # Get an array of attribute names
node.values # Get an array of attribute values
node.delete('href') # Delete attribute
node.each { |attr_name, attr_value| } # Iterate attributes

# Attributes nodes (Nokolexbor::Attribute)
node.attribute('href') # Get attribute node
node.attribute_nodes # Get an array of the attribute nodes
node.attributes # Get a hash of the attribute nodes; its keys are attribute names

# Inserting / modifying nodes
# The param of all the methods of this section can be: String, Node, DocumentFragment, NodeSet
param = '<a>123</a><a>456</a>' # String
param = doc.create_element('div') # Node
param = doc.fragment('<a>123</a><a>456</a>') # DocumentFragment
param = doc.css('div') # NodeSet
# Add `param` as previous sibling
node.previous=(param) 
node.before(param)
node.add_previous_sibling(param)
# Add `param` as next sibling
node.next=(param) 
node.after(param)
node.add_next_sibling(param)
# Add `param` as its child
node.add_child(param)
node << param
# Replace the node (including itself) with `param`
node.replace(param)
node.swap(param)
# Wrap the node with another node.
node.wrap("<div class='container'></div>")
# Replace the node's children with `param`
node.children=(param)
node.inner_html=(param)
# Re-parent the node
node.parent=(node)
# Replace the node's content with a text node containing `string`.
node.content=(string) 

# Deleting nodes
node.remove # Removed nodes can be re-attached to doc.

# Traversing
node.traverse { |node| } # Yields all children recursively.
node.next # Get next node.
node.next_element # Get next node of type Element.
node.previous # Get previous node.
node.previous_element # Get previous node of type Element.
node.parent # Get parent node
node.child # Get first child
node.children # Get the list of children as a NodeSet
node.elements # Get the list of element children of this node as a NodeSet.
node.first_element_child # Get the first child node of this node that is an element.
node.last_element_child # Get the last child node of this node that is an element.
node.ancestors # List ancestor nodes, closest to furthest, as a NodeSet.
node.ancestors(selector) # Filter the ancestors that match the selector. (Note: poor performance)

# Serialization
# Get the text content of the node
node.content 
node.text
node.inner_text
node.to_str
# Get the inner html of the node
node.inner_html
# Get the outer html of the node
node.outer_html
node.to_html
node.serialize
node.to_s

# Self testing
node.matches?(selector) # Does this node match this selector? (Note: poor performance)

# Rubyisms
node == another_node # Returns true if the nodes are the same one in memory.
node.clone # Copy this node.

# Write it out to an IO object that responds to `#write`
node.write_to(io)

# Inspection
node.inspect

# Utility
node.fragment(param) # Create a DocumentFragment containing tags that is relative to this context node.
node.parse(string) # Parse `string` as a document fragment within the context of this node.

Working with attributes

doc = Nokolexbor.HTML('<div class="a" href="b" style="c"></div>')
node = doc.at_css('div')

# Get attribute (Nokolexbor::Attribute)
attr = node.attribute('class')
# => #<Nokolexbor::Attribute class="a">

# Get name and value
attr.name # "class"
attr.value # "a"

# Set name and value
attr.name = "class1"
attr.value = "a1"
node.to_html # <div class1="a1" href="b" style="c"></div>

attr.parent # => #<Nokolexbor::Element <div class1="a1" href="b" style="c">>

# Traversing
attr.next # => #<Nokolexbor::Attribute href="b">
attr.next.previous == attr # => true

Working with NodeSet

# Create NodeSet
doc = Nokolexbor::HTML('<div class="a"></div><div class="b"></div>')
nodes = Nokolexbor::NodeSet.new(doc, [])

# Get NodeSet by searching methods
nodes = doc.css('div')
nodes = doc.xpath('//div')

# Set operations
nodes | other_nodeset # Union, return a new NodeSet with merged nodes, excluding duplicates
nodes + other_nodeset # Union, return a new NodeSet with merged nodes, excluding duplicates
nodes & other_nodeset # Intersection, return a new NodeSet with the common nodes only
nodes - other_nodeset # Difference, return a new NodeSet with the nodes in this NodeSet that aren't in other_nodeset
nodes.include?(node)
nodes.empty?
nodes.length
nodes.size
nodes.delete(node)

# List operations (includes Enumerable)
nodes.each { |node| }
nodes.map { |node| }
nodes.select { |node| }
nodes.find { |node| }
nodes.first
nodes.last
nodes.reverse
nodes.index(node) # Returns the numeric index or nil
nodes[3] # Get element at index 3
nodes[3, 4] # Return a NodeSet of size 4, starting at index 3
nodes[3..6] # Return a NodeSet using a range of indexes
nodes.pop
nodes.push(node)
nodes.shift

nodes.children # Returns a new NodeSet containing all the children of all the nodes in the NodeSet

# Serialization
nodes.content 
nodes.text
nodes.inner_text
nodes.to_str
# Get the inner html of the nodes
nodes.inner_html
# Get the outer html of the nodes
nodes.outer_html
nodes.to_html
nodes.serialize
nodes.to_s

# Batch operations on nodes
nodes.remove # Remove all its containing nodes.
nodes.wrap("<div class='container'></div>") # Wrap all containing nodes.
nodes.before(datum) # Insert datum before the first Node in this NodeSet # e.g. first.before(datum)
nodes.after(datum) # Insert datum after the last Node in this NodeSet # e.g. last.after(datum)
nodes.attr(key, value) # Set attribute on all containing nodes.
nodes.attr(key) { |node| 'value' } # Set attribute on all containing nodes.
nodes.remove_attr(name) # Remove attribute from all containing nodes.
nodes.add_class(name) # Append class to all containing nodes.

# Searching
nodes.css(selectors)
nodes.at_css(selectors)
nodes.xpath(paths)
nodes.at_xpath(paths)

# Convert to array
nodes.to_a

# Rubyisms
nodes == nodes # Two NodeSets are equal if they contain the same number of elements and if each element is equal to the corresponding element in the other NodeSet
nodes.dup # Duplicate this node set
nodes.inspect