diff --git a/packages/metascraper-logo-favicon/src/index.js b/packages/metascraper-logo-favicon/src/index.js index 4ec24630f..98c6f6ad3 100644 --- a/packages/metascraper-logo-favicon/src/index.js +++ b/packages/metascraper-logo-favicon/src/index.js @@ -1,14 +1,30 @@ 'use strict' -const { logo, parseUrl, normalizeUrl, toRule } = require('@metascraper/helpers') const { isEmpty, first, toNumber, chain, orderBy } = require('lodash') const reachableUrl = require('reachable-url') const memoize = require('@keyvhq/memoize') +const { + logo, + parseUrl, + normalizeUrl, + toRule, + extension +} = require('@metascraper/helpers') + +const ALLOWED_EXTENSION_CONTENT_TYPES = [ + ['ico', ['image/vnd.microsoft.icon', 'image/x-icon']], + ['png', ['image/png']] +] + const SIZE_REGEX_BY_X = /\d+x\d+/ const toLogo = toRule(logo) +const isValidContenType = (contentType, contentTypes) => { + return contentType && contentTypes.some(ct => contentType.includes(ct)) +} + const toSize = (input, url) => { if (isEmpty(input)) return @@ -85,9 +101,19 @@ const sizeSelectors = [ const firstReachable = async (domNodeSizes, gotOpts) => { for (const { url } of domNodeSizes) { const response = await reachableUrl(url, gotOpts) - if (reachableUrl.isReachable(response)) { - return response.url + if (!reachableUrl.isReachable(response)) continue + const contentType = response.headers['content-type'] + + const urlExtension = extension(url) + const contentTypes = ALLOWED_EXTENSION_CONTENT_TYPES.find( + ([ext]) => ext === urlExtension + ) + + if (contentTypes && !isValidContenType(contentType, contentTypes[1])) { + continue } + + return response.url } } @@ -109,22 +135,16 @@ const pickBiggerSize = async (sizes, { gotOpts } = {}) => { pickBiggerSize.sortBySize = collection => orderBy(collection, ['size.priority'], ['desc']) -const createFavicon = - ({ ext, contentTypes }) => - async (url, { gotOpts } = {}) => { - const faviconUrl = logo(`/favicon.${ext}`, { url }) - if (!faviconUrl) return undefined - - const response = await reachableUrl(faviconUrl, gotOpts) - const contentType = response.headers['content-type'] - - const isValidContenType = - contentType && contentTypes.some(ct => contentType.includes(ct)) - - return isValidContenType && reachableUrl.isReachable(response) - ? response.url - : undefined - } +const createFavicon = ([ext, contentTypes]) => { + return async (url, { gotOpts } = {}) => { + const faviconUrl = logo(`/favicon.${ext}`, { url }) + if (!faviconUrl) return undefined + const response = await reachableUrl(faviconUrl, gotOpts) + if (!reachableUrl.isReachable(response)) return undefined + const contentType = response.headers['content-type'] + return isValidContenType(contentType, contentTypes) && response.url + } +} const google = async (url, { gotOpts } = {}) => { const response = await reachableUrl(google.url(url), gotOpts) @@ -136,19 +156,11 @@ google.url = (url, size = 128) => const createGetLogo = ({ withGoogle, withFavicon, gotOpts, keyvOpts }) => { const getLogo = async url => { - const providers = [ - withFavicon && - createFavicon({ - ext: 'png', - contentTypes: ['image/png'] - }), - withFavicon && - createFavicon({ - ext: 'ico', - contentTypes: ['image/vnd.microsoft.icon', 'image/x-icon'] - }), - withGoogle && google - ].filter(Boolean) + const providers = ALLOWED_EXTENSION_CONTENT_TYPES.map( + ext => withFavicon && createFavicon(ext) + ) + .concat(withGoogle && google) + .filter(Boolean) for (const provider of providers) { const logoUrl = await provider(url, { gotOpts }) diff --git a/packages/metascraper-logo-favicon/test/favicon.js b/packages/metascraper-logo-favicon/test/favicon.js index 46cdc7019..c77270433 100644 --- a/packages/metascraper-logo-favicon/test/favicon.js +++ b/packages/metascraper-logo-favicon/test/favicon.js @@ -6,11 +6,11 @@ const { createFavicon } = require('..') const { runServer } = require('./helpers') -const faviconPNG = createFavicon({ ext: 'png', contentTypes: ['image/png'] }) -const faviconICO = createFavicon({ - ext: 'ico', - contentTypes: ['image/vnd.microsoft.icon', 'image/x-icon'] -}) +const faviconPNG = createFavicon(['png', ['image/png']]) +const faviconICO = createFavicon([ + 'ico', + ['image/vnd.microsoft.icon', 'image/x-icon'] +]) test('return undefined if favicon is not reachable', async t => { const url = 'https://idontexist.lol' @@ -36,7 +36,7 @@ test("don't resolve favicon.ico with no valid content-type", async t => { res.setHeader('content-type', 'image/svg+xml; charset=utf-8') res.end('') }) - t.is(await faviconICO(url), undefined) + t.is(await faviconICO(url), false) }) test("favicon.png with 'image/png' content-type", async t => { diff --git a/packages/metascraper-logo-favicon/test/index.js b/packages/metascraper-logo-favicon/test/index.js index c71a8da90..2725ad47b 100644 --- a/packages/metascraper-logo-favicon/test/index.js +++ b/packages/metascraper-logo-favicon/test/index.js @@ -4,6 +4,8 @@ const { readFile } = require('fs/promises') const { resolve } = require('path') const test = require('ava') +const { runServer } = require('./helpers') + const createMetascraper = opts => require('metascraper')([require('..')(opts)]) const createHtml = meta => @@ -251,3 +253,68 @@ test('avoid wrong data URI', async t => { const metadata = await metascraper({ url, html }) t.is(metadata.logo, 'https://www.adobe.com/favicon.ico') }) + +test("favicon.ico detected in HTML markup can't be random content-type", async t => { + const url = await runServer(t, async ({ res }) => { + res.setHeader('content-type', 'image/svg+xml') + res.end('') + }) + + const html = + '' + const metascraper = createMetascraper() + const metadata = await metascraper({ url, html }) + t.is(metadata.logo, null) +}) + +test('favicon.ico detected in HTML markup can be `image/x-icon` content-type', async t => { + const url = await runServer(t, async ({ res }) => { + res.setHeader('content-type', 'image/x-icon') + res.end() + }) + + const html = + '' + const metascraper = createMetascraper() + const metadata = await metascraper({ url, html }) + t.is(metadata.logo, `${url}favicon.ico`) +}) + +test('favicon.ico detected in HTML markup can be `image/vnd.microsoft.icon` content-type', async t => { + const url = await runServer(t, async ({ res }) => { + res.setHeader('content-type', 'image/vnd.microsoft.icon') + res.end() + }) + + const html = + '' + const metascraper = createMetascraper() + const metadata = await metascraper({ url, html }) + t.is(metadata.logo, `${url}favicon.ico`) +}) + +test("favicon.png detected in HTML markup can't be random content-type", async t => { + const url = await runServer(t, async ({ res }) => { + res.setHeader('content-type', 'image/svg+xml') + res.end('') + }) + + const html = + '' + const metascraper = createMetascraper() + const metadata = await metascraper({ url, html }) + t.is(metadata.logo, null) +}) + +test('favicon.png detected in HTML markup can be `image/png` content-type', async t => { + const url = await runServer(t, async ({ res }) => { + res.setHeader('content-type', 'image/png') + res.end() + }) + + const html = + '' + const metascraper = createMetascraper() + const metadata = await metascraper({ url, html }) + t.is(metadata.logo, `${url}favicon.png`) +}) diff --git a/packages/metascraper/test/integration/fast-company/index.js b/packages/metascraper/test/integration/fast-company/index.js index 549f77e80..b1186f066 100644 --- a/packages/metascraper/test/integration/fast-company/index.js +++ b/packages/metascraper/test/integration/fast-company/index.js @@ -26,6 +26,8 @@ const url = test('fast-company', async t => { const html = await readFile(resolve(__dirname, 'input.html')) - const metadata = await metascraper({ html, url }) + const { logo, ...metadata } = await metascraper({ html, url }) t.snapshot(metadata) + t.is(typeof logo, 'string') + t.true(new URL(logo).hostname.endsWith('.gstatic.com'), logo) }) diff --git a/packages/metascraper/test/integration/fast-company/snapshots/index.js.md b/packages/metascraper/test/integration/fast-company/snapshots/index.js.md index fad51184b..b8566c446 100644 --- a/packages/metascraper/test/integration/fast-company/snapshots/index.js.md +++ b/packages/metascraper/test/integration/fast-company/snapshots/index.js.md @@ -15,7 +15,6 @@ Generated by [AVA](https://avajs.dev). description: 'Lack of access to capital is a big challenge, but so is the lack of access to networks and advisors.', image: 'http://b.fastcompany.net/multisite_files/fastcompany/imagecache/620x350/poster/2016/05/3060169-poster-p-1-one-of-the-biggest-challenges-of-getting-funding-for-minority-owned-business.jpg', lang: 'en', - logo: 'https://www.fastcompany.com/favicon.ico', publisher: 'Fast Company', title: 'One Of The Biggest Challenges Of Getting Funding For Minority-Owned Business', url: 'http://www.fastcompany.com/3060169/one-of-the-biggest-challenges-of-getting-funding-for-minority-owned-business', diff --git a/packages/metascraper/test/integration/fast-company/snapshots/index.js.snap b/packages/metascraper/test/integration/fast-company/snapshots/index.js.snap index 13f39a185..161ce77b4 100644 Binary files a/packages/metascraper/test/integration/fast-company/snapshots/index.js.snap and b/packages/metascraper/test/integration/fast-company/snapshots/index.js.snap differ diff --git a/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.md b/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.md index 12c58ddd2..239fb9cc9 100644 --- a/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.md +++ b/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.md @@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev). description: 'Tech start-up Appthority’s office has plush conference rooms, soundproof phone booths, an enormous kitchen and a view of San Francisco Bay. It has ping-pong and foosball tables, beer on tap and 11 types of tea.', image: 'http://www.trbimg.com/img-572421a4/turbine/la-fi-tn-tech-downturn-20160429', lang: 'en', - logo: 'http://www.trbas.com/jive/prod/common/images/lanews-apple-touch-icon.1q2w3_9ffdb679907f116af126c65ff1edb27a.png', + logo: 'https://www.latimes.com/favicon.ico', publisher: 'latimes.com', title: 'As venture capital dries up, tech start-ups discover frugality', url: 'http://www.latimes.com/business/technology/la-fi-tn-tech-downturn-20160429-story.html', diff --git a/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.snap b/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.snap index a9a6c11ad..be3cc5706 100644 Binary files a/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.snap and b/packages/metascraper/test/integration/los-angeles-times/snapshots/index.js.snap differ diff --git a/packages/metascraper/test/integration/stuff/snapshots/index.js.md b/packages/metascraper/test/integration/stuff/snapshots/index.js.md index e20b31694..4d3a017e6 100644 --- a/packages/metascraper/test/integration/stuff/snapshots/index.js.md +++ b/packages/metascraper/test/integration/stuff/snapshots/index.js.md @@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev). description: 'Orphee Mickalad is on track to replace his former history teacher Tangi Utikere on Palmerston North City Council.', image: 'https://resources.stuff.co.nz/content/dam/images/4/y/p/h/8/h/image.related.StuffLandscapeSixteenByNine.1420x800.4yr12n.png/1613526047477.jpg', lang: 'en', - logo: 'https://www.stuff.co.nz/sics-assets/images/favicons/apple-touch-icon.png', + logo: 'https://www.stuff.co.nz/sics-assets/images/favicons/safari-pinned-tab.svg', publisher: 'Stuff', title: 'Orphee Mickalad leading Palmerston North by-election', url: 'https://www.stuff.co.nz/manawatu-standard/news/300232751/orphee-mickalad-leading-palmerston-north-byelection', diff --git a/packages/metascraper/test/integration/stuff/snapshots/index.js.snap b/packages/metascraper/test/integration/stuff/snapshots/index.js.snap index 6cd452150..490501516 100644 Binary files a/packages/metascraper/test/integration/stuff/snapshots/index.js.snap and b/packages/metascraper/test/integration/stuff/snapshots/index.js.snap differ diff --git a/packages/metascraper/test/integration/substack/snapshots/index.js.md b/packages/metascraper/test/integration/substack/snapshots/index.js.md index a28a83643..06289385f 100644 --- a/packages/metascraper/test/integration/substack/snapshots/index.js.md +++ b/packages/metascraper/test/integration/substack/snapshots/index.js.md @@ -14,7 +14,7 @@ Generated by [AVA](https://avajs.dev). description: 'The world is a very malleable place. When I read biographies, early lives leap out the most. Leonardo da Vinci was a studio apprentice to Verrocchio at 14. Walt Disney took on a number of jobs, chiefly delivering papers, from 11 years old. Vladimir Nabokov published his first book (a collection of poems) at 16, while still in school. Andrew Carnegie', image: 'https://substackcdn.com/image/fetch/w_1200,h_600,c_fill,f_jpg,q_auto:good,fl_progressive:steep,g_auto/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fef3bd0df-b9fa-4358-afee-116c23f4c55f_2560x1902.jpeg', lang: 'en', - logo: 'https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1115e358-65d9-4f1c-872a-f1ea44965132%2Fapple-touch-icon-1024x1024.png', + logo: 'https://t1.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL&url=https://simonsarris.substack.com/p/the-most-precious-resource-is-agency&size=128', publisher: 'The Map is Mostly Water', title: 'The Most Precious Resource is Agency', url: 'https://map.simonsarris.com/p/the-most-precious-resource-is-agency', diff --git a/packages/metascraper/test/integration/substack/snapshots/index.js.snap b/packages/metascraper/test/integration/substack/snapshots/index.js.snap index 62113baa8..8988b1a95 100644 Binary files a/packages/metascraper/test/integration/substack/snapshots/index.js.snap and b/packages/metascraper/test/integration/substack/snapshots/index.js.snap differ diff --git a/packages/metascraper/test/integration/therams/index.js b/packages/metascraper/test/integration/therams/index.js index 0f435961f..e3ace8ca2 100644 --- a/packages/metascraper/test/integration/therams/index.js +++ b/packages/metascraper/test/integration/therams/index.js @@ -26,6 +26,5 @@ const url = test('therams', async t => { const html = await readFile(resolve(__dirname, 'input.html')) const metadata = await metascraper({ html, url }) - console.log(metadata) t.snapshot(metadata) }) diff --git a/packages/metascraper/test/integration/wsj/snapshots/index.js.md b/packages/metascraper/test/integration/wsj/snapshots/index.js.md index 6125b5815..ffb86fe1f 100644 --- a/packages/metascraper/test/integration/wsj/snapshots/index.js.md +++ b/packages/metascraper/test/integration/wsj/snapshots/index.js.md @@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev). description: 'Funding Snapshot:', image: 'http://si.wsj.net/img/WSJ_Logo_black_social.gif', lang: 'en', - logo: 'http://s.wsj.net/media/wsj-pro-favicon.ico', + logo: 'https://www.wsj.com/apple-touch-icon.png', publisher: 'WSJ', title: 'Funding Snapshot: Software Development Platform CircleCI Raises $18M', url: 'http://www.wsj.com/articles/funding-snapshot-software-development-platform-circleci-raises-18m-1463398202', diff --git a/packages/metascraper/test/integration/wsj/snapshots/index.js.snap b/packages/metascraper/test/integration/wsj/snapshots/index.js.snap index 743b7295c..2f6e18ac9 100644 Binary files a/packages/metascraper/test/integration/wsj/snapshots/index.js.snap and b/packages/metascraper/test/integration/wsj/snapshots/index.js.snap differ