diff --git a/src/utils/extractMetaData.js b/src/utils/extractMetaData.js
index d3f1f53..8504fc1 100644
--- a/src/utils/extractMetaData.js
+++ b/src/utils/extractMetaData.js
@@ -146,7 +146,7 @@ export default (html) => {
const metadata = extractLdSchema(doc, entry)
if (!metadata.published) {
- metadata.published = findDate(doc) || ''
+ metadata.published = findDate(doc, metadata) || ''
}
return metadata
diff --git a/src/utils/extractMetaData.test.js b/src/utils/extractMetaData.test.js
index f200d87..387c04b 100644
--- a/src/utils/extractMetaData.test.js
+++ b/src/utils/extractMetaData.test.js
@@ -38,20 +38,25 @@ describe('test extractMetaData', () => {
it('test extractMetaData(find date)', async () => {
const html1 = readFileSync('./test-data/regular-article-date-time.html', 'utf8')
const html2 = readFileSync('./test-data/regular-article-date-itemprop.html', 'utf8')
- const html3 = readFileSync('./test-data/regular-article-date-span.html', 'utf8')
+ const html3 = readFileSync('./test-data/regular-article-date-url.html', 'utf8')
+ const html4 = readFileSync('./test-data/regular-article-date-span.html', 'utf8')
const result1 = extractMetaData(html1)
const result2 = extractMetaData(html2)
const result3 = extractMetaData(html3)
+ const result4 = extractMetaData(html4)
assert.ok(isObject(result1))
assert.ok(isObject(result2))
assert.ok(isObject(result3))
+ assert.ok(isObject(result4))
keys.forEach((k) => {
assert.ok(hasProperty(result1, k))
- assert.ok(hasProperty(result3, k))
+ assert.ok(hasProperty(result2, k))
assert.ok(hasProperty(result3, k))
+ assert.ok(hasProperty(result4, k))
})
assert.ok(isDateString(result1.published))
assert.ok(isDateString(result2.published))
- assert.ok(isDateString(result3.published))
+ assert.ok(isDateString(result3.published))
+ assert.ok(isDateString(result4.published))
})
})
diff --git a/src/utils/findDate.js b/src/utils/findDate.js
index 3a666e0..9bfbd19 100644
--- a/src/utils/findDate.js
+++ b/src/utils/findDate.js
@@ -1,10 +1,10 @@
/**
- * Convert date format to YYYY-MM-DD
- *
- * @param {string} dateString
- * @returns {string} YYYY-MM-DD
- */
+ * Convert date format to YYYY-MM-DD
+ *
+ * @param {string} dateString
+ * @returns {string} YYYY-MM-DD
+ */
function convertDateFormat (dateString) {
const parts = dateString.split('/')
if (parts.length !== 3) return dateString
@@ -22,34 +22,68 @@ function convertDateFormat (dateString) {
}
/**
- * Look for the publication date in the body of the content.
- *
- * @param {Document} document - The HTML Document
- * @returns {string} The date string
- */
-export default function (doc) {
+ * Extract a publication date from a `/YYYY/MM/DD` segment of a URL.
+ * @param {string} url - URL to inspect; a non-string value yields undefined
+ * @returns {string|undefined} `YYYY-MM-DDT00:00:00` when the segment is a real calendar date
+ */
+function dateFromUrl (url) {
+  const regex = /\/(\d{4})\/(\d{2})\/(\d{2})(?:[^\d]|$)/
+  const match = typeof url === 'string' ? url.match(regex) : null
+
+  if (match) {
+    const [, year, month, day] = match
+    const dateString = `${year}-${month}-${day}T00:00:00`
+
+    // Reject impossible dates (e.g. 02/31): the parsed components must round-trip.
+    const date = new Date(dateString)
+    if (date.getFullYear() === parseInt(year, 10) &&
+      date.getMonth() + 1 === parseInt(month, 10) &&
+      date.getDate() === parseInt(day, 10)) {
+      return dateString
+    }
+  }
+
+  return undefined
+}
+
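+/**
+ * @param {Element} element - Element whose text content is scanned for a date
+ * @returns {string|undefined} First match, passed through convertDateFormat, or undefined
+ */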
+function dateFromContent (element) {
const datePatterns = [
/\d{4}-\d{2}-\d{2}/,
/\d{1,2}\/\d{1,2}\/\d{2,4}/,
]
- const findDate = (element) => {
- for (const pattern of datePatterns) {
- const match = element.textContent.match(pattern)
- if (match) return convertDateFormat(match[0])
- }
- return ''
+ for (const pattern of datePatterns) {
+ const match = element.textContent.match(pattern)
+ if (match) return convertDateFormat(match[0])
}
+ return undefined
+}
+
+/**
+ * Look for publication date in the body of the content.
+ *
+ * @param {Document} doc - HTML Document
+ * @param {Object} metadata - Article metadata; its `url` is checked for a date
+ * @returns {string} Date string
+ */
+export default function (doc, metadata) {
const priorityElements = doc.querySelectorAll('time, [datetime], [itemprop~=datePublished], [itemprop~=dateCreated]')
for (const el of priorityElements) {
- const date = el.getAttribute('datetime') || el.getAttribute('content') || findDate(el)
+ const date = el.getAttribute('datetime') || el.getAttribute('content') || dateFromContent(el)
if (date) return date
}
+ const urlDate = dateFromUrl(metadata.url)
+ if (urlDate) return urlDate
+
const secondaryElements = doc.querySelectorAll('p, span, div')
for (const el of secondaryElements) {
- const date = findDate(el)
+ const date = dateFromContent(el)
if (date) return date
}
diff --git a/test-data/regular-article-date-url.html b/test-data/regular-article-date-url.html
new file mode 100644
index 0000000..3262110
--- /dev/null
+++ b/test-data/regular-article-date-url.html
@@ -0,0 +1,55 @@
+
+
+
+
+
+ Article title here - ArticleParser
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Article title here
+
+ Few can name a
rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.
+
+ Those cheetahs are nothing more than dogs. A watermelon is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.
+ The first fair dog is, in its own way, a lemon.
+ 4746 Kelly Drive, West Virginia
+
+
+
+
+
+
+
+