Merge pull request #375 from extractus/dev
v8.0.8
ndaidong authored Dec 5, 2023
2 parents 22e04cc + 2ec2573 commit 986a409
Showing 16 changed files with 226 additions and 56 deletions.
11 changes: 4 additions & 7 deletions .github/workflows/ci-test.yml
@@ -12,13 +12,13 @@ jobs:

strategy:
matrix:
- node_version: [18.x, 20.x]
+ node_version: [18.x, 20.x, 21.x]

steps:
- uses: actions/checkout@v3

- name: setup Node.js v${{ matrix.node_version }}
- uses: actions/setup-node@v3
+ uses: actions/setup-node@v4
with:
node-version: ${{ matrix.node_version }}

@@ -31,8 +31,8 @@ jobs:
npm run build --if-present
npm run test
- - name: Coveralls GitHub Action
- uses: coverallsapp/github-action@1.1.3
+ - name: Report Coveralls
+ uses: coverallsapp/github-action@v2
with:
github-token: ${{ secrets.GITHUB_TOKEN }}

@@ -43,6 +43,3 @@ jobs:
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
@@ -38,7 +38,7 @@ jobs:

steps:
- name: Checkout repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4

# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
1 change: 1 addition & 0 deletions README.md
@@ -105,6 +105,7 @@ The result - `article` - can be `null` or an object with the following structure
favicon: String,
content: String,
published: Date String,
+ type: String, // page type
source: String, // original publisher
links: Array, // list of alternative links
ttr: Number, // time to read in second, 0 = unknown
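For context, a minimal usage sketch (not part of this commit) of where the new `type` field surfaces through the public `extract` API; the URL is a placeholder:

```js
// Sketch only: reading the new `type` field from an extraction result.
// Assumes any article page exposing og:type or JSON-LD page-type metadata.
import { extract } from '@extractus/article-extractor'

const article = await extract('https://example.com/some-article')
if (article) {
  console.log(article.title)
  console.log(article.type) // e.g. 'article' or 'newsarticle'; empty string when no type metadata is found
}
```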
11 changes: 8 additions & 3 deletions eval.js
@@ -6,7 +6,7 @@ import { readFileSync, writeFileSync, existsSync } from 'node:fs'
import { slugify } from 'bellajs'

import { isValid as isValidUrl } from './src/utils/linker.js'
- import { extract } from './src/main.js'
+ import { extractFromHtml } from './src/main.js'

if (!existsSync('evaluation')) {
execSync('mkdir evaluation')
@@ -15,7 +15,12 @@ if (!existsSync('evaluation')) {
const extractFromUrl = async (url) => {
try {
console.time('extraction')
- const art = await extract(url)
+ const res = await fetch(url)
+ const buffer = await res.arrayBuffer()
+ const decoder = new TextDecoder('iso-8859-1')
+ const html = decoder.decode(buffer)
+
+ const art = await extractFromHtml(html)
console.log(art)
const slug = slugify(art.title)
writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8')
@@ -28,7 +33,7 @@ const extractFromUrl = async (url) => {
const extractFromFile = async (fpath) => {
try {
const html = readFileSync(fpath, 'utf8')
- const art = await extract(html)
+ const art = await extractFromHtml(html)
console.log(art)
} catch (err) {
console.trace(err)
2 changes: 1 addition & 1 deletion examples/browser-article-parser/package.json
@@ -7,6 +7,6 @@
},
"dependencies": {
"express": "^4.18.2",
"got": "^13.0.0"
"got": "^14.0.0"
}
}
4 changes: 2 additions & 2 deletions examples/bun-article-parser/package.json
@@ -5,10 +5,10 @@
"start": "bun run index.ts"
},
"devDependencies": {
"bun-types": "^0.6.13"
"bun-types": "^1.0.15"
},
"dependencies": {
"@extractus/article-extractor": "latest",
"hono": "^3.2.7"
"hono": "^3.11.2"
}
}
4 changes: 2 additions & 2 deletions examples/deno-article-parser/deno.json
@@ -2,8 +2,8 @@
"name": "deno-article-parser",
"version": "1.0.0",
"imports": {
"serve": "https://deno.land/std@0.203.0/http/server.ts",
"hono": "https://deno.land/x/hono@v3.7.2/mod.ts",
"serve": "https://deno.land/std/http/server.ts",
"hono": "https://deno.land/x/hono@v3.11.2/mod.ts",
"article-extractor": "https://esm.sh/@extractus/article-extractor"
},
"tasks": {
3 changes: 2 additions & 1 deletion examples/tsnode-article-parser/package.json
@@ -2,12 +2,13 @@
"name": "tsnode-article-parser",
"version": "1.0.0",
"main": "index.ts",
"type": "module",
"scripts": {
"prestart": "npx tsc",
"start": "node dist/index.js"
},
"devDependencies": {
"typescript": "^5.1.6"
"typescript": "^5.3.2"
},
"dependencies": {
"@extractus/article-extractor": "latest",
4 changes: 2 additions & 2 deletions examples/tsnode-article-parser/tsconfig.json
@@ -1,8 +1,8 @@
{
"compilerOptions": {
"module": "commonjs",
"module": "es6",
"esModuleInterop": true,
"target": "es6",
"target": "esnext",
"moduleResolution": "node",
"sourceMap": true,
"outDir": "dist"
1 change: 1 addition & 0 deletions index.d.ts
@@ -82,6 +82,7 @@ export interface ArticleData {
source?: string;
published?: string;
ttr?: number;
+ type?: string;
}

export function extract(input: string, parserOptions?: ParserOptions, fetchOptions?: FetchOptions): Promise<ArticleData|null>;
14 changes: 7 additions & 7 deletions package.json
@@ -1,5 +1,5 @@
{
"version": "8.0.3",
"version": "8.0.4",
"name": "@extractus/article-extractor",
"description": "To extract main article from given URL",
"homepage": "https://github.com/extractus/article-extractor",
@@ -33,15 +33,15 @@
"@mozilla/readability": "^0.4.4",
"bellajs": "^11.1.2",
"cross-fetch": "^4.0.0",
"linkedom": "^0.15.1",
"linkedom": "^0.16.4",
"sanitize-html": "2.11.0"
},
"devDependencies": {
"@types/sanitize-html": "^2.9.0",
"eslint": "^8.47.0",
"https-proxy-agent": "^7.0.1",
"jest": "^29.6.2",
"nock": "^13.3.2"
"@types/sanitize-html": "^2.9.5",
"eslint": "^8.55.0",
"https-proxy-agent": "^7.0.2",
"jest": "^29.7.0",
"nock": "^13.4.0"
},
"keywords": [
"article",
71 changes: 71 additions & 0 deletions src/utils/extractLdSchema.js
@@ -0,0 +1,71 @@
const typeSchemas = [
'aboutpage',
'checkoutpage',
'collectionpage',
'contactpage',
'faqpage',
'itempage',
'medicalwebpage',
'profilepage',
'qapage',
'realestatelisting',
'searchresultspage',
'webpage',
'website',
'article',
'advertisercontentarticle',
'newsarticle',
'analysisnewsarticle',
'askpublicnewsarticle',
'backgroundnewsarticle',
'opinionnewsarticle',
'reportagenewsarticle',
'reviewnewsarticle',
'report',
'satiricalarticle',
'scholarlyarticle',
'medicalscholarlyarticle',
]

const attributeLists = {
description: 'description',
image: 'image',
author: 'author',
published: 'datePublished',
type: '@type',
}

/**
* Parses JSON-LD data from a document and populates an entry object.
* Only populates if the original entry object is empty or undefined.
*
* @param {Document} document - The HTML Document
* @param {Object} entry - The entry object to merge/populate with JSON-LD.
* @returns {Object} The entry object after being merged/populated with data.
*/
export default (document, entry) => {
const ldSchema = document.querySelector('script[type="application/ld+json"]')?.textContent

if (!ldSchema) {
return entry
}

const ldJson = JSON.parse(ldSchema)
Object.entries(attributeLists).forEach(([key, attr]) => {
if ((typeof entry[key] === 'undefined' || entry[key] === '') && ldJson[attr]) {
if (key === 'type' && typeof ldJson[attr] === 'string') {
return entry[key] = typeSchemas.includes(ldJson[attr].toLowerCase()) ? ldJson[attr].toLowerCase() : ''
}

if (typeof ldJson[attr] === 'string') {
return entry[key] = ldJson[attr].toLowerCase()
}

if (Array.isArray(ldJson[attr]) && typeof ldJson[attr][0] === 'string') {
return entry[key] = ldJson[attr][0].toLowerCase()
}
}
})

return entry
}
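A hedged sketch of exercising the new helper on its own; the HTML snippet and the seeded entry object are invented for illustration, and linkedom is already a dependency of the project:

```js
import { DOMParser } from 'linkedom'
import extractLdSchema from './src/utils/extractLdSchema.js'

const html = `<html><head>
  <script type="application/ld+json">
    {"@type": "NewsArticle", "description": "An example description", "datePublished": "2023-12-05"}
  </script>
</head><body></body></html>`

const document = new DOMParser().parseFromString(html, 'text/html')
const entry = extractLdSchema(document, { description: '', published: '', type: '' })

// entry.type === 'newsarticle'                    (matched against typeSchemas, lower-cased)
// entry.description === 'an example description'  (string values are lower-cased too)
// entry.published === '2023-12-05'                (copied from datePublished)
```

Note that values are only filled in when the corresponding entry field is empty or undefined, so metadata already found elsewhere is never overwritten.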
76 changes: 47 additions & 29 deletions src/utils/extractMetaData.js
@@ -1,10 +1,35 @@
// utils -> extractMetaData

import { DOMParser } from 'linkedom'
+ import extractLdSchema from './extractLdSchema.js'

+ /**
+ * @param {Element} node
+ * @param {Object} attributeLists
+ * @returns {?{key: string, content: string}}
+ */
+ function getMetaContentByNameOrProperty (node, attributeLists) {
+ const content = node.getAttribute('content')
+ if (!content) return null
+
+ const property = node
+ .getAttribute('property')?.toLowerCase() ??
+ node.getAttribute('itemprop')?.toLowerCase()
+
+ const name = node.getAttribute('name')?.toLowerCase()
+
+ for (const [key, attrs] of Object.entries(attributeLists)) {
+ if (attrs.includes(property) || attrs.includes(name)) {
+ return { key, content }
+ }
+ }
+
+ return null
+ }
+
/**
* @param html {string}
- * @returns {{image: string, author: string, amphtml: string, description: string, canonical: string, source: string, published: string, title: string, url: string, shortlink: string, favicon: string}}
+ * @returns {{image: string, author: string, amphtml: string, description: string, canonical: string, source: string, published: string, title: string, url: string, shortlink: string, favicon: string, type: string}}
*/
export default (html) => {
const entry = {
@@ -19,6 +44,7 @@ export default (html) => {
source: '',
published: '',
favicon: '',
+ type: '',
}

const sourceAttrs = [
@@ -80,6 +106,20 @@
'date',
'parsely-pub-date',
]
+ const typeAttrs = [
+ 'og:type',
+ ]
+
+ const attributeLists = {
+ source: sourceAttrs,
+ url: urlAttrs,
+ title: titleAttrs,
+ description: descriptionAttrs,
+ image: imageAttrs,
+ author: authorAttrs,
+ published: publishedTimeAttrs,
+ type: typeAttrs,
+ }

const document = new DOMParser().parseFromString(html, 'text/html')
entry.title = document.querySelector('head > title')?.innerText
@@ -96,35 +136,13 @@
})

Array.from(document.getElementsByTagName('meta')).forEach(node => {
- const content = node.getAttribute('content')
- if (!content) {
- return false
- }
- const property = node.getAttribute('property')?.toLowerCase() ?? node.getAttribute('itemprop')?.toLowerCase()
- const name = node.getAttribute('name')?.toLowerCase()
-
- if (sourceAttrs.includes(property) || sourceAttrs.includes(name)) {
- entry.source = content
- }
- if (urlAttrs.includes(property) || urlAttrs.includes(name)) {
- entry.url = content
- }
- if (titleAttrs.includes(property) || titleAttrs.includes(name)) {
- entry.title = content
- }
- if (descriptionAttrs.includes(property) || descriptionAttrs.includes(name)) {
- entry.description = content
- }
- if (imageAttrs.includes(property) || imageAttrs.includes(name)) {
- entry.image = content
- }
- if (authorAttrs.includes(property) || authorAttrs.includes(name)) {
- entry.author = content
- }
- if (publishedTimeAttrs.includes(property) || publishedTimeAttrs.includes(name)) {
- entry.published = content
+ const result = getMetaContentByNameOrProperty(node, attributeLists)
+ if (result) {
+ entry[result.key] = result.content
}
})

- return entry
+ const entries = extractLdSchema(document, entry)
+
+ return entries
}
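A small sketch (not from the commit) of the refactored flow: the per-attribute if-chain is replaced by a single attributeLists table plus getMetaContentByNameOrProperty, and og:type now lands in entry.type. The HTML below is invented:

```js
import extractMetaData from './src/utils/extractMetaData.js'

const html = `<html><head>
  <title>Sample page</title>
  <meta property="og:type" content="article">
</head><body></body></html>`

const meta = extractMetaData(html)
console.log(meta.type)  // 'article', resolved via the typeAttrs list in attributeLists
console.log(meta.title) // 'Sample page', taken from <head><title>
```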
11 changes: 10 additions & 1 deletion src/utils/extractMetaData.test.js
@@ -7,7 +7,7 @@ import { isObject, hasProperty } from 'bellajs'

import extractMetaData from './extractMetaData.js'

- const keys = 'url shortlink amphtml canonical title description image author source published favicon'.split(' ')
+ const keys = 'url shortlink amphtml canonical title description image author source published favicon type'.split(' ')

test('test extractMetaData(good content)', async () => {
const html = readFileSync('./test-data/regular-article.html', 'utf8')
@@ -17,3 +17,12 @@ test('test extractMetaData(good content)', async () => {
expect(hasProperty(result, k)).toBe(true)
})
})
+
+ test('test extractMetaData(json ld schema content)', async () => {
+ const html = readFileSync('./test-data/regular-article-json-ld.html', 'utf8')
+ const result = extractMetaData(html)
+ expect(isObject(result)).toBe(true)
+ keys.forEach((k) => {
+ expect(hasProperty(result, k)).toBe(true)
+ })
+ })
2 changes: 2 additions & 0 deletions src/utils/parseFromHtml.js
@@ -45,6 +45,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
author,
published,
favicon: metaFav,
+ type,
} = meta

const {
@@ -127,5 +128,6 @@
source: getDomain(bestUrl),
published,
ttr: getTimeToRead(textContent, wordsPerMinute),
+ type,
}
}