diff --git a/packages/metascraper-logo-favicon/src/index.js b/packages/metascraper-logo-favicon/src/index.js
index 4ec24630f..98c6f6ad3 100644
--- a/packages/metascraper-logo-favicon/src/index.js
+++ b/packages/metascraper-logo-favicon/src/index.js
@@ -1,14 +1,30 @@
'use strict'
-const { logo, parseUrl, normalizeUrl, toRule } = require('@metascraper/helpers')
const { isEmpty, first, toNumber, chain, orderBy } = require('lodash')
const reachableUrl = require('reachable-url')
const memoize = require('@keyvhq/memoize')
+const {
+ logo,
+ parseUrl,
+ normalizeUrl,
+ toRule,
+ extension
+} = require('@metascraper/helpers')
+
+const ALLOWED_EXTENSION_CONTENT_TYPES = [ // entries are [file extension, Content-Type values accepted for it]; array order is also the order the fallback favicon providers are built in
+ ['ico', ['image/vnd.microsoft.icon', 'image/x-icon']], // NOTE(review): the replaced provider list tried png before ico — confirm the new order is intended
+ ['png', ['image/png']]
+]
+
const SIZE_REGEX_BY_X = /\d+x\d+/
const toLogo = toRule(logo)
+const isValidContenType = (contentType, contentTypes) => { // falsy when the header is missing or contains none of `contentTypes`
+ return contentType && contentTypes.some(ct => contentType.toLowerCase().includes(ct.toLowerCase())) // media types are case-insensitive, so compare lower-cased
+}
+
const toSize = (input, url) => {
if (isEmpty(input)) return
@@ -85,9 +101,19 @@ const sizeSelectors = [
const firstReachable = async (domNodeSizes, gotOpts) => {
for (const { url } of domNodeSizes) {
const response = await reachableUrl(url, gotOpts)
- if (reachableUrl.isReachable(response)) {
- return response.url
+ if (!reachableUrl.isReachable(response)) continue // unreachable candidate: move on to the next one
+ const contentType = response.headers['content-type']
+
+ const urlExtension = extension(url) // taken from the candidate URL, not from response.url
+ const contentTypes = ALLOWED_EXTENSION_CONTENT_TYPES.find( // the [ext, contentTypes] entry for this extension, or undefined for any other extension
+ ([ext]) => ext === urlExtension
+ )
+
+ if (contentTypes && !isValidContenType(contentType, contentTypes[1])) { // a listed extension must be served with one of its Content-Types
+ continue
}
+
+ return response.url // first candidate that passed; the function yields undefined when none does
}
}
@@ -109,22 +135,16 @@ const pickBiggerSize = async (sizes, { gotOpts } = {}) => {
pickBiggerSize.sortBySize = collection =>
orderBy(collection, ['size.priority'], ['desc'])
-const createFavicon =
- ({ ext, contentTypes }) =>
- async (url, { gotOpts } = {}) => {
- const faviconUrl = logo(`/favicon.${ext}`, { url })
- if (!faviconUrl) return undefined
-
- const response = await reachableUrl(faviconUrl, gotOpts)
- const contentType = response.headers['content-type']
-
- const isValidContenType =
- contentType && contentTypes.some(ct => contentType.includes(ct))
-
- return isValidContenType && reachableUrl.isReachable(response)
- ? response.url
- : undefined
- }
+const createFavicon = ([ext, contentTypes]) => { // builds an async resolver for `/favicon.<ext>` that accepts only the given Content-Types
+ return async (url, { gotOpts } = {}) => {
+ const faviconUrl = logo(`/favicon.${ext}`, { url })
+ if (!faviconUrl) return undefined
+ const response = await reachableUrl(faviconUrl, gotOpts)
+ if (!reachableUrl.isReachable(response)) return undefined
+ const contentType = response.headers['content-type']
+ return isValidContenType(contentType, contentTypes) && response.url // NOTE(review): yields a falsy value (e.g. false) rather than always undefined on a Content-Type mismatch; the replaced version returned undefined
+ }
+}
const google = async (url, { gotOpts } = {}) => {
const response = await reachableUrl(google.url(url), gotOpts)
@@ -136,19 +156,11 @@ google.url = (url, size = 128) =>
const createGetLogo = ({ withGoogle, withFavicon, gotOpts, keyvOpts }) => {
const getLogo = async url => {
- const providers = [
- withFavicon &&
- createFavicon({
- ext: 'png',
- contentTypes: ['image/png']
- }),
- withFavicon &&
- createFavicon({
- ext: 'ico',
- contentTypes: ['image/vnd.microsoft.icon', 'image/x-icon']
- }),
- withGoogle && google
- ].filter(Boolean)
+ const providers = ALLOWED_EXTENSION_CONTENT_TYPES.map(
+ ext => withFavicon && createFavicon(ext)
+ )
+ .concat(withGoogle && google)
+ .filter(Boolean)
for (const provider of providers) {
const logoUrl = await provider(url, { gotOpts })
diff --git a/packages/metascraper-logo-favicon/test/favicon.js b/packages/metascraper-logo-favicon/test/favicon.js
index 46cdc7019..c77270433 100644
--- a/packages/metascraper-logo-favicon/test/favicon.js
+++ b/packages/metascraper-logo-favicon/test/favicon.js
@@ -6,11 +6,11 @@ const { createFavicon } = require('..')
const { runServer } = require('./helpers')
-const faviconPNG = createFavicon({ ext: 'png', contentTypes: ['image/png'] })
-const faviconICO = createFavicon({
- ext: 'ico',
- contentTypes: ['image/vnd.microsoft.icon', 'image/x-icon']
-})
+const faviconPNG = createFavicon(['png', ['image/png']]) // createFavicon takes an [extension, contentTypes] tuple
+const faviconICO = createFavicon([
+ 'ico',
+ ['image/vnd.microsoft.icon', 'image/x-icon']
+])
test('return undefined if favicon is not reachable', async t => {
const url = 'https://idontexist.lol'
@@ -36,7 +36,7 @@ test("don't resolve favicon.ico with no valid content-type", async t => {
res.setHeader('content-type', 'image/svg+xml; charset=utf-8')
res.end('')
})
- t.is(await faviconICO(url), undefined)
+ t.is(await faviconICO(url), false)
})
test("favicon.png with 'image/png' content-type", async t => {
diff --git a/packages/metascraper-logo-favicon/test/index.js b/packages/metascraper-logo-favicon/test/index.js
index c71a8da90..2725ad47b 100644
--- a/packages/metascraper-logo-favicon/test/index.js
+++ b/packages/metascraper-logo-favicon/test/index.js
@@ -4,6 +4,8 @@ const { readFile } = require('fs/promises')
const { resolve } = require('path')
const test = require('ava')
+const { runServer } = require('./helpers')
+
const createMetascraper = opts => require('metascraper')([require('..')(opts)])
const createHtml = meta =>
@@ -251,3 +253,68 @@ test('avoid wrong data URI', async t => {
const metadata = await metascraper({ url, html })
t.is(metadata.logo, 'https://www.adobe.com/favicon.ico')
})
+
+test("favicon.ico detected in HTML markup can't be random content-type", async t => {
+ const url = await runServer(t, async ({ res }) => {
+ res.setHeader('content-type', 'image/svg+xml')
+ res.end('')
+ })
+
+ const html = // icon declared in markup, the case named in the test title
+ '<link rel="icon" href="/favicon.ico">'
+ const metascraper = createMetascraper()
+ const metadata = await metascraper({ url, html })
+ t.is(metadata.logo, null)
+})
+
+test('favicon.ico detected in HTML markup can be `image/x-icon` content-type', async t => {
+ const url = await runServer(t, async ({ res }) => {
+ res.setHeader('content-type', 'image/x-icon')
+ res.end()
+ })
+
+ const html = // icon declared in markup, the case named in the test title
+ '<link rel="icon" href="/favicon.ico">'
+ const metascraper = createMetascraper()
+ const metadata = await metascraper({ url, html })
+ t.is(metadata.logo, `${url}favicon.ico`)
+})
+
+test('favicon.ico detected in HTML markup can be `image/vnd.microsoft.icon` content-type', async t => {
+ const url = await runServer(t, async ({ res }) => {
+ res.setHeader('content-type', 'image/vnd.microsoft.icon')
+ res.end()
+ })
+
+ const html = // icon declared in markup, the case named in the test title
+ '<link rel="icon" href="/favicon.ico">'
+ const metascraper = createMetascraper()
+ const metadata = await metascraper({ url, html })
+ t.is(metadata.logo, `${url}favicon.ico`)
+})
+
+test("favicon.png detected in HTML markup can't be random content-type", async t => {
+ const url = await runServer(t, async ({ res }) => {
+ res.setHeader('content-type', 'image/svg+xml')
+ res.end('')
+ })
+
+ const html = // icon declared in markup, the case named in the test title
+ '<link rel="icon" href="/favicon.png">'
+ const metascraper = createMetascraper()
+ const metadata = await metascraper({ url, html })
+ t.is(metadata.logo, null)
+})
+
+test('favicon.png detected in HTML markup can be `image/png` content-type', async t => {
+ const url = await runServer(t, async ({ res }) => {
+ res.setHeader('content-type', 'image/png')
+ res.end()
+ })
+
+ const html = // icon declared in markup, the case named in the test title
+ '<link rel="icon" href="/favicon.png">'
+ const metascraper = createMetascraper()
+ const metadata = await metascraper({ url, html })
+ t.is(metadata.logo, `${url}favicon.png`)
+})
diff --git a/packages/metascraper/test/integration/fast-company/index.js b/packages/metascraper/test/integration/fast-company/index.js
index 549f77e80..b1186f066 100644
--- a/packages/metascraper/test/integration/fast-company/index.js
+++ b/packages/metascraper/test/integration/fast-company/index.js
@@ -26,6 +26,8 @@ const url =
test('fast-company', async t => {
const html = await readFile(resolve(__dirname, 'input.html'))
- const metadata = await metascraper({ html, url })
+ const { logo, ...metadata } = await metascraper({ html, url }) // logo is kept out of the snapshot and asserted separately below
t.snapshot(metadata)
+ t.is(typeof logo, 'string')
+ t.true(new URL(logo).hostname.endsWith('.gstatic.com'), logo) // only the host suffix is pinned; the logo URL doubles as the failure message
})
diff --git a/packages/metascraper/test/integration/fast-company/snapshots/index.js.md b/packages/metascraper/test/integration/fast-company/snapshots/index.js.md
index fad51184b..b8566c446 100644
--- a/packages/metascraper/test/integration/fast-company/snapshots/index.js.md
+++ b/packages/metascraper/test/integration/fast-company/snapshots/index.js.md
@@ -15,7 +15,6 @@ Generated by [AVA](https://avajs.dev).
description: 'Lack of access to capital is a big challenge, but so is the lack of access to networks and advisors.',
image: 'http://b.fastcompany.net/multisite_files/fastcompany/imagecache/620x350/poster/2016/05/3060169-poster-p-1-one-of-the-biggest-challenges-of-getting-funding-for-minority-owned-business.jpg',
lang: 'en',
- logo: 'https://www.fastcompany.com/favicon.ico',
publisher: 'Fast Company',
title: 'One Of The Biggest Challenges Of Getting Funding For Minority-Owned Business',
url: 'http://www.fastcompany.com/3060169/one-of-the-biggest-challenges-of-getting-funding-for-minority-owned-business',
diff --git a/packages/metascraper/test/integration/fast-company/snapshots/index.js.snap b/packages/metascraper/test/integration/fast-company/snapshots/index.js.snap
index 13f39a185..161ce77b4 100644
Binary files a/packages/metascraper/test/integration/fast-company/snapshots/index.js.snap and b/packages/metascraper/test/integration/fast-company/snapshots/index.js.snap differ
diff --git a/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.md b/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.md
index 12c58ddd2..239fb9cc9 100644
--- a/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.md
+++ b/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.md
@@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev).
description: 'Tech start-up Appthority’s office has plush conference rooms, soundproof phone booths, an enormous kitchen and a view of San Francisco Bay. It has ping-pong and foosball tables, beer on tap and 11 types of tea.',
image: 'http://www.trbimg.com/img-572421a4/turbine/la-fi-tn-tech-downturn-20160429',
lang: 'en',
- logo: 'http://www.trbas.com/jive/prod/common/images/lanews-apple-touch-icon.1q2w3_9ffdb679907f116af126c65ff1edb27a.png',
+ logo: 'https://www.latimes.com/favicon.ico',
publisher: 'latimes.com',
title: 'As venture capital dries up, tech start-ups discover frugality',
url: 'http://www.latimes.com/business/technology/la-fi-tn-tech-downturn-20160429-story.html',
diff --git a/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.snap b/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.snap
index a9a6c11ad..be3cc5706 100644
Binary files a/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.snap and b/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.snap differ
diff --git a/packages/metascraper/test/integration/stuff/snapshots/index.js.md b/packages/metascraper/test/integration/stuff/snapshots/index.js.md
index e20b31694..4d3a017e6 100644
--- a/packages/metascraper/test/integration/stuff/snapshots/index.js.md
+++ b/packages/metascraper/test/integration/stuff/snapshots/index.js.md
@@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev).
description: 'Orphee Mickalad is on track to replace his former history teacher Tangi Utikere on Palmerston North City Council.',
image: 'https://resources.stuff.co.nz/content/dam/images/4/y/p/h/8/h/image.related.StuffLandscapeSixteenByNine.1420x800.4yr12n.png/1613526047477.jpg',
lang: 'en',
- logo: 'https://www.stuff.co.nz/sics-assets/images/favicons/apple-touch-icon.png',
+ logo: 'https://www.stuff.co.nz/sics-assets/images/favicons/safari-pinned-tab.svg',
publisher: 'Stuff',
title: 'Orphee Mickalad leading Palmerston North by-election',
url: 'https://www.stuff.co.nz/manawatu-standard/news/300232751/orphee-mickalad-leading-palmerston-north-byelection',
diff --git a/packages/metascraper/test/integration/stuff/snapshots/index.js.snap b/packages/metascraper/test/integration/stuff/snapshots/index.js.snap
index 6cd452150..490501516 100644
Binary files a/packages/metascraper/test/integration/stuff/snapshots/index.js.snap and b/packages/metascraper/test/integration/stuff/snapshots/index.js.snap differ
diff --git a/packages/metascraper/test/integration/substack/snapshots/index.js.md b/packages/metascraper/test/integration/substack/snapshots/index.js.md
index a28a83643..06289385f 100644
--- a/packages/metascraper/test/integration/substack/snapshots/index.js.md
+++ b/packages/metascraper/test/integration/substack/snapshots/index.js.md
@@ -14,7 +14,7 @@ Generated by [AVA](https://avajs.dev).
description: 'The world is a very malleable place. When I read biographies, early lives leap out the most. Leonardo da Vinci was a studio apprentice to Verrocchio at 14. Walt Disney took on a number of jobs, chiefly delivering papers, from 11 years old. Vladimir Nabokov published his first book (a collection of poems) at 16, while still in school. Andrew Carnegie',
image: 'https://substackcdn.com/image/fetch/w_1200,h_600,c_fill,f_jpg,q_auto:good,fl_progressive:steep,g_auto/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fef3bd0df-b9fa-4358-afee-116c23f4c55f_2560x1902.jpeg',
lang: 'en',
- logo: 'https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1115e358-65d9-4f1c-872a-f1ea44965132%2Fapple-touch-icon-1024x1024.png',
+ logo: 'https://t1.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL&url=https://simonsarris.substack.com/p/the-most-precious-resource-is-agency&size=128',
publisher: 'The Map is Mostly Water',
title: 'The Most Precious Resource is Agency',
url: 'https://map.simonsarris.com/p/the-most-precious-resource-is-agency',
diff --git a/packages/metascraper/test/integration/substack/snapshots/index.js.snap b/packages/metascraper/test/integration/substack/snapshots/index.js.snap
index 62113baa8..8988b1a95 100644
Binary files a/packages/metascraper/test/integration/substack/snapshots/index.js.snap and b/packages/metascraper/test/integration/substack/snapshots/index.js.snap differ
diff --git a/packages/metascraper/test/integration/therams/index.js b/packages/metascraper/test/integration/therams/index.js
index 0f435961f..e3ace8ca2 100644
--- a/packages/metascraper/test/integration/therams/index.js
+++ b/packages/metascraper/test/integration/therams/index.js
@@ -26,6 +26,5 @@ const url =
test('therams', async t => {
const html = await readFile(resolve(__dirname, 'input.html'))
const metadata = await metascraper({ html, url })
- console.log(metadata)
t.snapshot(metadata)
})
diff --git a/packages/metascraper/test/integration/wsj/snapshots/index.js.md b/packages/metascraper/test/integration/wsj/snapshots/index.js.md
index 6125b5815..ffb86fe1f 100644
--- a/packages/metascraper/test/integration/wsj/snapshots/index.js.md
+++ b/packages/metascraper/test/integration/wsj/snapshots/index.js.md
@@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev).
description: 'Funding Snapshot:',
image: 'http://si.wsj.net/img/WSJ_Logo_black_social.gif',
lang: 'en',
- logo: 'http://s.wsj.net/media/wsj-pro-favicon.ico',
+ logo: 'https://www.wsj.com/apple-touch-icon.png',
publisher: 'WSJ',
title: 'Funding Snapshot: Software Development Platform CircleCI Raises $18M',
url: 'http://www.wsj.com/articles/funding-snapshot-software-development-platform-circleci-raises-18m-1463398202',
diff --git a/packages/metascraper/test/integration/wsj/snapshots/index.js.snap b/packages/metascraper/test/integration/wsj/snapshots/index.js.snap
index 743b7295c..2f6e18ac9 100644
Binary files a/packages/metascraper/test/integration/wsj/snapshots/index.js.snap and b/packages/metascraper/test/integration/wsj/snapshots/index.js.snap differ