Merge pull request #374 from andremacola/main

Feat: extract pagetype from og:type or ld+json
extractus · Dec 5, 2023 · f84aec2 · f84aec2
2 parents 22e04cc + 2fe4d72
commit f84aec2
Show file tree

Hide file tree

Showing 6 changed files with 196 additions and 30 deletions.
diff --git a/index.d.ts b/index.d.ts
@@ -82,6 +82,7 @@ export interface ArticleData {
   source?: string;
   published?: string;
   ttr?: number;
+  type?: string;
 }
 
 export function extract(input: string, parserOptions?: ParserOptions, fetchOptions?: FetchOptions): Promise<ArticleData|null>;

diff --git a/src/utils/extractLdSchema.js b/src/utils/extractLdSchema.js
@@ -0,0 +1,71 @@
+// Whitelist of page/article type names accepted from a JSON-LD `@type`.
+// Entries are lower case because the incoming `@type` is lower-cased before
+// the `includes` lookup; a value that is not listed here results in an empty
+// `type` field.
+// NOTE(review): the names appear to mirror schema.org WebPage / WebSite /
+// Article types - confirm against schema.org before adding new entries.
+const typeSchemas = [
+  'aboutpage',
+  'checkoutpage',
+  'collectionpage',
+  'contactpage',
+  'faqpage',
+  'itempage',
+  'medicalwebpage',
+  'profilepage',
+  'qapage',
+  'realestatelisting',
+  'searchresultspage',
+  'webpage',
+  'website',
+  'article',
+  'advertisercontentarticle',
+  'newsarticle',
+  'analysisnewsarticle',
+  'askpublicnewsarticle',
+  'backgroundnewsarticle',
+  'opinionnewsarticle',
+  'reportagenewsarticle',
+  'reviewnewsarticle',
+  'report',
+  'satiricalarticle',
+  'scholarlyarticle',
+  'medicalscholarlyarticle',
+]
+
+// Maps an entry field name (key) to the JSON-LD property it is read from
+// (value). Only the fields listed here are ever filled from ld+json.
+const attributeLists = {
+  description: 'description',
+  image: 'image',
+  author: 'author',
+  published: 'datePublished',
+  type: '@type',
+}
+
+/**
+ * Parses JSON-LD data from a document and populates an entry object.
+ * Only populates if the original entry object is empty or undefined.
+ *
+ * @param {Document} document - The HTML Document
+ * @param {Object} entry - The entry object to merge/populate with JSON-LD.
+ * @returns {Object} The entry object after being merged/populated with data.
+ */
+export default (document, entry) => {
+  const ldSchema = document.querySelector('script[type="application/ld+json"]')?.textContent
+
+  if (!ldSchema) {
+    return entry
+  }
+
+  const ldJson = JSON.parse(ldSchema)
+  Object.entries(attributeLists).forEach(([key, attr]) => {
+    if ((typeof entry[key] === 'undefined' || entry[key] === '') && ldJson[attr]) {
+      if (key === 'type' && typeof ldJson[attr] === 'string') {
+        return entry[key] = typeSchemas.includes(ldJson[attr].toLowerCase()) ? ldJson[attr].toLowerCase() : ''
+      }
+
+      if (typeof ldJson[attr] === 'string') {
+        return entry[key] = ldJson[attr].toLowerCase()
+      }
+
+      if (Array.isArray(ldJson[attr]) && typeof ldJson[attr][0] === 'string') {
+        return entry[key] = ldJson[attr][0].toLowerCase()
+      }
+    }
+  })
+
+  return entry
+}
diff --git a/src/utils/extractMetaData.js b/src/utils/extractMetaData.js
@@ -1,10 +1,35 @@
 // utils -> extractMetaData
 
 import { DOMParser } from 'linkedom'
+import extractLdSchema from './extractLdSchema.js'
+
+/**
+ * @param {Element} node
+ * @param {Object} attributeLists
+ * @returns {?{key: string, content: string}}
+ */
+function getMetaContentByNameOrProperty (node, attributeLists) {
+  const content = node.getAttribute('content')
+  if (!content) return null
+
+  const property = node
+    .getAttribute('property')?.toLowerCase() ??
+    node.getAttribute('itemprop')?.toLowerCase()
+
+  const name = node.getAttribute('name')?.toLowerCase()
+
+  for (const [key, attrs] of Object.entries(attributeLists)) {
+    if (attrs.includes(property) || attrs.includes(name)) {
+      return { key, content }
+    }
+  }
+
+  return null
+}
 
 /**
  * @param html {string}
- * @returns {{image: string, author: string, amphtml: string, description: string, canonical: string, source: string, published: string, title: string, url: string, shortlink: string, favicon: string}}
+ * @returns {{image: string, author: string, amphtml: string, description: string, canonical: string, source: string, published: string, title: string, url: string, shortlink: string, favicon: string, type: string}}
  */
 export default (html) => {
   const entry = {
@@ -19,6 +44,7 @@ export default (html) => {
     source: '',
     published: '',
     favicon: '',
+    type: '',
   }
 
   const sourceAttrs = [
@@ -80,6 +106,20 @@ export default (html) => {
     'date',
     'parsely-pub-date',
   ]
+  const typeAttrs = [
+    'og:type',
+  ]
+
+  const attributeLists = {
+    source: sourceAttrs,
+    url: urlAttrs,
+    title: titleAttrs,
+    description: descriptionAttrs,
+    image: imageAttrs,
+    author: authorAttrs,
+    published: publishedTimeAttrs,
+    type: typeAttrs,
+  }
 
   const document = new DOMParser().parseFromString(html, 'text/html')
   entry.title = document.querySelector('head > title')?.innerText
@@ -96,35 +136,13 @@ export default (html) => {
   })
 
   Array.from(document.getElementsByTagName('meta')).forEach(node => {
-    const content = node.getAttribute('content')
-    if (!content) {
-      return false
-    }
-    const property = node.getAttribute('property')?.toLowerCase() ?? node.getAttribute('itemprop')?.toLowerCase()
-    const name = node.getAttribute('name')?.toLowerCase()
-
-    if (sourceAttrs.includes(property) || sourceAttrs.includes(name)) {
-      entry.source = content
-    }
-    if (urlAttrs.includes(property) || urlAttrs.includes(name)) {
-      entry.url = content
-    }
-    if (titleAttrs.includes(property) || titleAttrs.includes(name)) {
-      entry.title = content
-    }
-    if (descriptionAttrs.includes(property) || descriptionAttrs.includes(name)) {
-      entry.description = content
-    }
-    if (imageAttrs.includes(property) || imageAttrs.includes(name)) {
-      entry.image = content
-    }
-    if (authorAttrs.includes(property) || authorAttrs.includes(name)) {
-      entry.author = content
-    }
-    if (publishedTimeAttrs.includes(property) || publishedTimeAttrs.includes(name)) {
-      entry.published = content
+    const result = getMetaContentByNameOrProperty(node, attributeLists)
+    if (result) {
+      entry[result.key] = result.content
     }
   })
 
-  return entry
+  const entries = extractLdSchema(document, entry)
+
+  return entries
 }
diff --git a/src/utils/extractMetaData.test.js b/src/utils/extractMetaData.test.js
@@ -7,7 +7,7 @@ import { isObject, hasProperty } from 'bellajs'
 
 import extractMetaData from './extractMetaData.js'
 
-const keys = 'url shortlink amphtml canonical title description image author source published favicon'.split(' ')
+const keys = 'url shortlink amphtml canonical title description image author source published favicon type'.split(' ')
 
 test('test extractMetaData(good content)', async () => {
   const html = readFileSync('./test-data/regular-article.html', 'utf8')
@@ -17,3 +17,12 @@ test('test extractMetaData(good content)', async () => {
     expect(hasProperty(result, k)).toBe(true)
   })
 })
+
+test('test extractMetaData(json ld schema content)', async () => {
+  const html = readFileSync('./test-data/regular-article-json-ld.html', 'utf8')
+  const result = extractMetaData(html)
+  expect(isObject(result)).toBe(true)
+  keys.forEach((k) => {
+    expect(hasProperty(result, k)).toBe(true)
+  })
+})
diff --git a/src/utils/parseFromHtml.js b/src/utils/parseFromHtml.js
@@ -45,6 +45,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
     author,
     published,
     favicon: metaFav,
+    type,
   } = meta
 
   const {
@@ -127,5 +128,6 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
     source: getDomain(bestUrl),
     published,
     ttr: getTimeToRead(textContent, wordsPerMinute),
+    type,
   }
 }
diff --git a/test-data/regular-article-json-ld.html b/test-data/regular-article-json-ld.html
@@ -0,0 +1,65 @@
+<!doctype html>
+<html>
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>Article title here - ArticleParser</title>
+    <meta name="keywords" content="alpha, beta, gamma">
+    <meta name="twitter:site" content="@ArticleParser">
+    <meta name="twitter:url" content="https://somewhere.com/path/to/article-title-here">
+    <meta name="twitter:card" content="summary_large_image">
+    <meta name="twitter:creator" content="@alice">
+    <meta property="og:title" content="Article title here">
+    <meta property="og:url" content="https://somewhere.com/path/to/article-title-here">
+
+
+<script type="application/ld+json">
+    { "@context": "https://schema.org",
+      "@type": "Article",
+      "author": "Alice",
+      "image": [
+        "https://somewhere.com/path/to/image.jpg",
+        "https://somewhere.com/path/to/image2.jpg",
+        "https://somewhere.com/path/to/image3.jpg"
+      ],
+      "datePublished": "23\/01\/2014",
+      "dateCreated": "23\/01\/2014",
+      "description": "Navigation here Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs."
+    }
+  </script>
+
+    <link rel="stylesheet" href="/path/to/cssfile.css">
+    <link rel="canonical" href="https://somewhere.com/another/path/to/article-title-here">
+    <link rel="amphtml" href="https://m.somewhere.com/another/path/to/article-title-here.amp">
+    <link rel="shortlink" href="https://sw.re/419283">
+    <link rel="icon" href="https://somewhere.com/favicon.ico">
+
+    <link rel="alternate" title="ArticleParser" type="application/atom+xml" href="https://somewhere.com/atom.xml">
+
+    <link rel="manifest" href="/manifest.json">
+  </head>
+  <body>
+    <header>Page header here</header>
+    <main>
+      <section>
+        <nav>Navigation here</nav>
+      </section>
+      <section>
+        <h1>Article title here</h1>
+        <article>
+          <div class="contentdetail">Few can name a <a href="https://otherwhere.com/descriptions/rational-peach">rational peach</a> that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.</div>
+          <p class="contentdetail">
+            Those cheetahs are nothing more than dogs. A <a href="/dict/watermelon">watermelon</a> is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.</p>
+          <p>The first fair dog is, in its own way, a lemon.</p>
+          <address>4746 Kelly Drive, West Virginia</address>
+          <img src="./orange.png" style="border: solid 1px #000">
+        </article>
+      </section>
+      <section class="sidebar-widget">
+        <widget>Some widget here</widget>
+        <widget>Some widget here</widget>
+      </section>
+    </main>
+    <footer>Page footer here</footer>
+  </body>
+</html>