Merge pull request #375 from extractus/dev
v8.0.8
ndaidong authored Dec 5, 2023
2 parents 22e04cc + 2ec2573 commit 986a409
Showing 16 changed files with 226 additions and 56 deletions.
11 changes: 4 additions & 7 deletions .github/workflows/ci-test.yml
@@ -12,13 +12,13 @@ jobs:

strategy:
matrix:
- node_version: [18.x, 20.x]
+ node_version: [18.x, 20.x, 21.x]

steps:
- uses: actions/checkout@v3

- name: setup Node.js v${{ matrix.node_version }}
- uses: actions/setup-node@v3
+ uses: actions/setup-node@v4
with:
node-version: ${{ matrix.node_version }}

@@ -31,8 +31,8 @@ jobs:
npm run build --if-present
npm run test
- - name: Coveralls GitHub Action
- uses: coverallsapp/github-action@1.1.3
+ - name: Report Coveralls
+ uses: coverallsapp/github-action@v2
with:
github-token: ${{ secrets.GITHUB_TOKEN }}

@@ -43,6 +43,3 @@ jobs:
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
@@ -38,7 +38,7 @@ jobs:

steps:
- name: Checkout repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4

# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
1 change: 1 addition & 0 deletions README.md
@@ -105,6 +105,7 @@ The result - `article` - can be `null` or an object with the following structure
favicon: String,
content: String,
published: Date String,
+ type: String, // page type
source: String, // original publisher
links: Array, // list of alternative links
ttr: Number, // time to read in second, 0 = unknown
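For context, a minimal usage sketch (not part of this commit) of where the new `type` field surfaces through the public `extract` API; the URL is a placeholder:

```js
// Sketch only: reading the new `type` field from an extraction result.
// Assumes any article page exposing og:type or JSON-LD page-type metadata.
import { extract } from '@extractus/article-extractor'

const article = await extract('https://example.com/some-article')
if (article) {
  console.log(article.title)
  console.log(article.type) // e.g. 'article' or 'newsarticle'; empty string when no type metadata is found
}
```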
11 changes: 8 additions & 3 deletions eval.js
@@ -6,7 +6,7 @@ import { readFileSync, writeFileSync, existsSync } from 'node:fs'
import { slugify } from 'bellajs'

import { isValid as isValidUrl } from './src/utils/linker.js'
- import { extract } from './src/main.js'
+ import { extractFromHtml } from './src/main.js'

if (!existsSync('evaluation')) {
execSync('mkdir evaluation')
@@ -15,7 +15,12 @@ if (!existsSync('evaluation')) {
const extractFromUrl = async (url) => {
try {
console.time('extraction')
- const art = await extract(url)
+ const res = await fetch(url)
+ const buffer = await res.arrayBuffer()
+ const decoder = new TextDecoder('iso-8859-1')
+ const html = decoder.decode(buffer)
+
+ const art = await extractFromHtml(html)
console.log(art)
const slug = slugify(art.title)
writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8')
@@ -28,7 +33,7 @@ const extractFromUrl = async (url) => {
const extractFromFile = async (fpath) => {
try {
const html = readFileSync(fpath, 'utf8')
- const art = await extract(html)
+ const art = await extractFromHtml(html)
console.log(art)
} catch (err) {
console.trace(err)
2 changes: 1 addition & 1 deletion examples/browser-article-parser/package.json
@@ -7,6 +7,6 @@
},
"dependencies": {
"express": "^4.18.2",
"got": "^13.0.0"
"got": "^14.0.0"
}
}
4 changes: 2 additions & 2 deletions examples/bun-article-parser/package.json
@@ -5,10 +5,10 @@
"start": "bun run index.ts"
},
"devDependencies": {
"bun-types": "^0.6.13"
"bun-types": "^1.0.15"
},
"dependencies": {
"@extractus/article-extractor": "latest",
"hono": "^3.2.7"
"hono": "^3.11.2"
}
}
4 changes: 2 additions & 2 deletions examples/deno-article-parser/deno.json
@@ -2,8 +2,8 @@
"name": "deno-article-parser",
"version": "1.0.0",
"imports": {
"serve": "https://deno.land/std@0.203.0/http/server.ts",
"hono": "https://deno.land/x/hono@v3.7.2/mod.ts",
"serve": "https://deno.land/std/http/server.ts",
"hono": "https://deno.land/x/hono@v3.11.2/mod.ts",
"article-extractor": "https://esm.sh/@extractus/article-extractor"
},
"tasks": {
3 changes: 2 additions & 1 deletion examples/tsnode-article-parser/package.json
@@ -2,12 +2,13 @@
"name": "tsnode-article-parser",
"version": "1.0.0",
"main": "index.ts",
"type": "module",
"scripts": {
"prestart": "npx tsc",
"start": "node dist/index.js"
},
"devDependencies": {
"typescript": "^5.1.6"
"typescript": "^5.3.2"
},
"dependencies": {
"@extractus/article-extractor": "latest",
4 changes: 2 additions & 2 deletions examples/tsnode-article-parser/tsconfig.json
@@ -1,8 +1,8 @@
{
"compilerOptions": {
"module": "commonjs",
"module": "es6",
"esModuleInterop": true,
"target": "es6",
"target": "esnext",
"moduleResolution": "node",
"sourceMap": true,
"outDir": "dist"
1 change: 1 addition & 0 deletions index.d.ts
@@ -82,6 +82,7 @@ export interface ArticleData {
source?: string;
published?: string;
ttr?: number;
+ type?: string;
}

export function extract(input: string, parserOptions?: ParserOptions, fetchOptions?: FetchOptions): Promise<ArticleData|null>;
14 changes: 7 additions & 7 deletions package.json
@@ -1,5 +1,5 @@
{
"version": "8.0.3",
"version": "8.0.4",
"name": "@extractus/article-extractor",
"description": "To extract main article from given URL",
"homepage": "https://github.com/extractus/article-extractor",
@@ -33,15 +33,15 @@
"@mozilla/readability": "^0.4.4",
"bellajs": "^11.1.2",
"cross-fetch": "^4.0.0",
"linkedom": "^0.15.1",
"linkedom": "^0.16.4",
"sanitize-html": "2.11.0"
},
"devDependencies": {
"@types/sanitize-html": "^2.9.0",
"eslint": "^8.47.0",
"https-proxy-agent": "^7.0.1",
"jest": "^29.6.2",
"nock": "^13.3.2"
"@types/sanitize-html": "^2.9.5",
"eslint": "^8.55.0",
"https-proxy-agent": "^7.0.2",
"jest": "^29.7.0",
"nock": "^13.4.0"
},
"keywords": [
"article",
71 changes: 71 additions & 0 deletions src/utils/extractLdSchema.js
@@ -0,0 +1,71 @@
const typeSchemas = [
'aboutpage',
'checkoutpage',
'collectionpage',
'contactpage',
'faqpage',
'itempage',
'medicalwebpage',
'profilepage',
'qapage',
'realestatelisting',
'searchresultspage',
'webpage',
'website',
'article',
'advertisercontentarticle',
'newsarticle',
'analysisnewsarticle',
'askpublicnewsarticle',
'backgroundnewsarticle',
'opinionnewsarticle',
'reportagenewsarticle',
'reviewnewsarticle',
'report',
'satiricalarticle',
'scholarlyarticle',
'medicalscholarlyarticle',
]

const attributeLists = {
description: 'description',
image: 'image',
author: 'author',
published: 'datePublished',
type: '@type',
}

/**
* Parses JSON-LD data from a document and populates an entry object.
* Only populates if the original entry object is empty or undefined.
*
* @param {Document} document - The HTML Document
* @param {Object} entry - The entry object to merge/populate with JSON-LD.
* @returns {Object} The entry object after being merged/populated with data.
*/
export default (document, entry) => {
const ldSchema = document.querySelector('script[type="application/ld+json"]')?.textContent

if (!ldSchema) {
return entry
}

const ldJson = JSON.parse(ldSchema)
Object.entries(attributeLists).forEach(([key, attr]) => {
if ((typeof entry[key] === 'undefined' || entry[key] === '') && ldJson[attr]) {
if (key === 'type' && typeof ldJson[attr] === 'string') {
return entry[key] = typeSchemas.includes(ldJson[attr].toLowerCase()) ? ldJson[attr].toLowerCase() : ''
}

if (typeof ldJson[attr] === 'string') {
return entry[key] = ldJson[attr].toLowerCase()
}

if (Array.isArray(ldJson[attr]) && typeof ldJson[attr][0] === 'string') {
return entry[key] = ldJson[attr][0].toLowerCase()
}
}
})

return entry
}
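A hedged sketch of exercising the new helper on its own; the HTML snippet and the seeded entry object are invented for illustration, and linkedom is already a dependency of the project:

```js
import { DOMParser } from 'linkedom'
import extractLdSchema from './src/utils/extractLdSchema.js'

const html = `<html><head>
  <script type="application/ld+json">
    {"@type": "NewsArticle", "description": "An example description", "datePublished": "2023-12-05"}
  </script>
</head><body></body></html>`

const document = new DOMParser().parseFromString(html, 'text/html')
const entry = extractLdSchema(document, { description: '', published: '', type: '' })

// entry.type === 'newsarticle'                    (matched against typeSchemas, lower-cased)
// entry.description === 'an example description'  (string values are lower-cased too)
// entry.published === '2023-12-05'                (copied from datePublished)
```

Note that values are only filled in when the corresponding entry field is empty or undefined, so metadata already found elsewhere is never overwritten.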
76 changes: 47 additions & 29 deletions src/utils/extractMetaData.js
@@ -1,10 +1,35 @@
// utils -> extractMetaData

import { DOMParser } from 'linkedom'
+ import extractLdSchema from './extractLdSchema.js'

+ /**
+ * @param {Element} node
+ * @param {Object} attributeLists
+ * @returns {?{key: string, content: string}}
+ */
+ function getMetaContentByNameOrProperty (node, attributeLists) {
+ const content = node.getAttribute('content')
+ if (!content) return null
+
+ const property = node
+ .getAttribute('property')?.toLowerCase() ??
+ node.getAttribute('itemprop')?.toLowerCase()
+
+ const name = node.getAttribute('name')?.toLowerCase()
+
+ for (const [key, attrs] of Object.entries(attributeLists)) {
+ if (attrs.includes(property) || attrs.includes(name)) {
+ return { key, content }
+ }
+ }
+
+ return null
+ }
+
/**
* @param html {string}
- * @returns {{image: string, author: string, amphtml: string, description: string, canonical: string, source: string, published: string, title: string, url: string, shortlink: string, favicon: string}}
+ * @returns {{image: string, author: string, amphtml: string, description: string, canonical: string, source: string, published: string, title: string, url: string, shortlink: string, favicon: string, type: string}}
*/
export default (html) => {
const entry = {
@@ -19,6 +44,7 @@ export default (html) => {
source: '',
published: '',
favicon: '',
+ type: '',
}

const sourceAttrs = [
@@ -80,6 +106,20 @@
'date',
'parsely-pub-date',
]
+ const typeAttrs = [
+ 'og:type',
+ ]
+
+ const attributeLists = {
+ source: sourceAttrs,
+ url: urlAttrs,
+ title: titleAttrs,
+ description: descriptionAttrs,
+ image: imageAttrs,
+ author: authorAttrs,
+ published: publishedTimeAttrs,
+ type: typeAttrs,
+ }

const document = new DOMParser().parseFromString(html, 'text/html')
entry.title = document.querySelector('head > title')?.innerText
@@ -96,35 +136,13 @@
})

Array.from(document.getElementsByTagName('meta')).forEach(node => {
- const content = node.getAttribute('content')
- if (!content) {
- return false
- }
- const property = node.getAttribute('property')?.toLowerCase() ?? node.getAttribute('itemprop')?.toLowerCase()
- const name = node.getAttribute('name')?.toLowerCase()
-
- if (sourceAttrs.includes(property) || sourceAttrs.includes(name)) {
- entry.source = content
- }
- if (urlAttrs.includes(property) || urlAttrs.includes(name)) {
- entry.url = content
- }
- if (titleAttrs.includes(property) || titleAttrs.includes(name)) {
- entry.title = content
- }
- if (descriptionAttrs.includes(property) || descriptionAttrs.includes(name)) {
- entry.description = content
- }
- if (imageAttrs.includes(property) || imageAttrs.includes(name)) {
- entry.image = content
- }
- if (authorAttrs.includes(property) || authorAttrs.includes(name)) {
- entry.author = content
- }
- if (publishedTimeAttrs.includes(property) || publishedTimeAttrs.includes(name)) {
- entry.published = content
+ const result = getMetaContentByNameOrProperty(node, attributeLists)
+ if (result) {
+ entry[result.key] = result.content
}
})

- return entry
+ const entries = extractLdSchema(document, entry)
+
+ return entries
}
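A small sketch (not from the commit) of the refactored flow: the per-attribute if-chain is replaced by a single attributeLists table plus getMetaContentByNameOrProperty, and og:type now lands in entry.type. The HTML below is invented:

```js
import extractMetaData from './src/utils/extractMetaData.js'

const html = `<html><head>
  <title>Sample page</title>
  <meta property="og:type" content="article">
</head><body></body></html>`

const meta = extractMetaData(html)
console.log(meta.type)  // 'article', resolved via the typeAttrs list in attributeLists
console.log(meta.title) // 'Sample page', taken from <head><title>
```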
11 changes: 10 additions & 1 deletion src/utils/extractMetaData.test.js
@@ -7,7 +7,7 @@ import { isObject, hasProperty } from 'bellajs'

import extractMetaData from './extractMetaData.js'

- const keys = 'url shortlink amphtml canonical title description image author source published favicon'.split(' ')
+ const keys = 'url shortlink amphtml canonical title description image author source published favicon type'.split(' ')

test('test extractMetaData(good content)', async () => {
const html = readFileSync('./test-data/regular-article.html', 'utf8')
@@ -17,3 +17,12 @@ test('test extractMetaData(good content)', async () => {
expect(hasProperty(result, k)).toBe(true)
})
})
+
+ test('test extractMetaData(json ld schema content)', async () => {
+ const html = readFileSync('./test-data/regular-article-json-ld.html', 'utf8')
+ const result = extractMetaData(html)
+ expect(isObject(result)).toBe(true)
+ keys.forEach((k) => {
+ expect(hasProperty(result, k)).toBe(true)
+ })
+ })
2 changes: 2 additions & 0 deletions src/utils/parseFromHtml.js
@@ -45,6 +45,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
author,
published,
favicon: metaFav,
+ type,
} = meta

const {
@@ -127,5 +128,6 @@
source: getDomain(bestUrl),
published,
ttr: getTimeToRead(textContent, wordsPerMinute),
+ type,
}
}