Skip to content

Commit

Permalink
Merge pull request #709 from microlinkhq/favicon
Browse files Browse the repository at this point in the history
fix: ensure favicon detect in markup is expected content-type
  • Loading branch information
Kikobeats authored Jun 17, 2024
2 parents dba98fa + e366887 commit 60079a0
Show file tree
Hide file tree
Showing 15 changed files with 124 additions and 45 deletions.
76 changes: 44 additions & 32 deletions packages/metascraper-logo-favicon/src/index.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,30 @@
'use strict'

const { logo, parseUrl, normalizeUrl, toRule } = require('@metascraper/helpers')
const { isEmpty, first, toNumber, chain, orderBy } = require('lodash')
const reachableUrl = require('reachable-url')
const memoize = require('@keyvhq/memoize')

const {
logo,
parseUrl,
normalizeUrl,
toRule,
extension
} = require('@metascraper/helpers')

// Content types accepted for a favicon, grouped by file extension.
// Each entry is an `[extension, contentTypes]` pair: the first item is the
// extension without the dot, the second is the list of `content-type` values
// a response for that extension is allowed to carry.
const ALLOWED_EXTENSION_CONTENT_TYPES = [
['ico', ['image/vnd.microsoft.icon', 'image/x-icon']],
['png', ['image/png']]
]

const SIZE_REGEX_BY_X = /\d+x\d+/

const toLogo = toRule(logo)

/**
 * Check whether a response `content-type` matches one of the allowed types.
 *
 * Matching is by substring, so parameters such as `; charset=utf-8` are
 * tolerated, and it is case-insensitive because media types are
 * case-insensitive (`IMAGE/PNG` is the same type as `image/png`).
 *
 * @param {string|undefined} contentType - value of the `content-type` header.
 * @param {string[]} contentTypes - allowed media types.
 * @returns {boolean|string|undefined} `true`/`false` when a header value is
 *   present; otherwise the falsy `contentType` itself, unchanged.
 */
const isValidContenType = (contentType, contentTypes) => {
  // Keep the falsy value as-is: callers chain this result with `&&`.
  if (!contentType) return contentType
  const normalized = String(contentType).toLowerCase()
  return contentTypes.some(ct => normalized.includes(ct.toLowerCase()))
}

const toSize = (input, url) => {
if (isEmpty(input)) return

Expand Down Expand Up @@ -85,9 +101,19 @@ const sizeSelectors = [
const firstReachable = async (domNodeSizes, gotOpts) => {
for (const { url } of domNodeSizes) {
const response = await reachableUrl(url, gotOpts)
if (reachableUrl.isReachable(response)) {
return response.url
if (!reachableUrl.isReachable(response)) continue
const contentType = response.headers['content-type']

const urlExtension = extension(url)
const contentTypes = ALLOWED_EXTENSION_CONTENT_TYPES.find(
([ext]) => ext === urlExtension
)

if (contentTypes && !isValidContenType(contentType, contentTypes[1])) {
continue
}

return response.url
}
}

Expand All @@ -109,22 +135,16 @@ const pickBiggerSize = async (sizes, { gotOpts } = {}) => {
// Order the collection by `size.priority`, highest priority first.
pickBiggerSize.sortBySize = collection => {
  const sortKeys = ['size.priority']
  const sortDirections = ['desc']
  return orderBy(collection, sortKeys, sortDirections)
}

const createFavicon =
({ ext, contentTypes }) =>
async (url, { gotOpts } = {}) => {
const faviconUrl = logo(`/favicon.${ext}`, { url })
if (!faviconUrl) return undefined

const response = await reachableUrl(faviconUrl, gotOpts)
const contentType = response.headers['content-type']

const isValidContenType =
contentType && contentTypes.some(ct => contentType.includes(ct))

return isValidContenType && reachableUrl.isReachable(response)
? response.url
: undefined
}
/**
 * Build a provider that probes `/favicon.<ext>` for a given page URL.
 *
 * @param {[string, string[]]} entry - `[ext, contentTypes]` pair: the favicon
 *   extension and the `content-type` values accepted for it.
 * @returns {(url: string, opts?: { gotOpts?: object }) => Promise<*>}
 *   Resolves to the response URL when the favicon is reachable and its
 *   content type is accepted; `undefined` when no favicon URL can be built or
 *   it is not reachable; otherwise the falsy result of the content-type check.
 */
const createFavicon = ([ext, contentTypes]) =>
  async (url, { gotOpts } = {}) => {
    const candidateUrl = logo(`/favicon.${ext}`, { url })
    if (!candidateUrl) return undefined

    const res = await reachableUrl(candidateUrl, gotOpts)
    if (!reachableUrl.isReachable(res)) return undefined

    // Headers are only inspected once the resource is known to be reachable.
    const hasAllowedType = isValidContenType(
      res.headers['content-type'],
      contentTypes
    )
    return hasAllowedType && res.url
  }

const google = async (url, { gotOpts } = {}) => {
const response = await reachableUrl(google.url(url), gotOpts)
Expand All @@ -136,19 +156,11 @@ google.url = (url, size = 128) =>

const createGetLogo = ({ withGoogle, withFavicon, gotOpts, keyvOpts }) => {
const getLogo = async url => {
const providers = [
withFavicon &&
createFavicon({
ext: 'png',
contentTypes: ['image/png']
}),
withFavicon &&
createFavicon({
ext: 'ico',
contentTypes: ['image/vnd.microsoft.icon', 'image/x-icon']
}),
withGoogle && google
].filter(Boolean)
const providers = ALLOWED_EXTENSION_CONTENT_TYPES.map(
ext => withFavicon && createFavicon(ext)
)
.concat(withGoogle && google)
.filter(Boolean)

for (const provider of providers) {
const logoUrl = await provider(url, { gotOpts })
Expand Down
12 changes: 6 additions & 6 deletions packages/metascraper-logo-favicon/test/favicon.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ const { createFavicon } = require('..')

const { runServer } = require('./helpers')

const faviconPNG = createFavicon({ ext: 'png', contentTypes: ['image/png'] })
const faviconICO = createFavicon({
ext: 'ico',
contentTypes: ['image/vnd.microsoft.icon', 'image/x-icon']
})
const faviconPNG = createFavicon(['png', ['image/png']])
const faviconICO = createFavicon([
'ico',
['image/vnd.microsoft.icon', 'image/x-icon']
])

test('return undefined if favicon is not reachable', async t => {
const url = 'https://idontexist.lol'
Expand All @@ -36,7 +36,7 @@ test("don't resolve favicon.ico with no valid content-type", async t => {
res.setHeader('content-type', 'image/svg+xml; charset=utf-8')
res.end('<svg></svg>')
})
t.is(await faviconICO(url), undefined)
t.is(await faviconICO(url), false)
})

test("favicon.png with 'image/png' content-type", async t => {
Expand Down
67 changes: 67 additions & 0 deletions packages/metascraper-logo-favicon/test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ const { readFile } = require('fs/promises')
const { resolve } = require('path')
const test = require('ava')

const { runServer } = require('./helpers')

const createMetascraper = opts => require('metascraper')([require('..')(opts)])

const createHtml = meta =>
Expand Down Expand Up @@ -251,3 +253,68 @@ test('avoid wrong data URI', async t => {
const metadata = await metascraper({ url, html })
t.is(metadata.logo, 'https://www.adobe.com/favicon.ico')
})

// Favicons declared in the HTML markup are only accepted as the logo when the
// server answers with a content-type allowed for their extension. Each case
// serves every request with a fixed content-type/body and states the expected
// logo path (`null` when the favicon must be rejected).
for (const { title, html, contentType, body, logoPath } of [
  {
    title: "favicon.ico detected in HTML markup can't be random content-type",
    html: '<link rel="icon" href="/favicon.ico" type="image/x-icon" sizes="120x116">',
    contentType: 'image/svg+xml',
    body: '<svg></svg>',
    logoPath: null
  },
  {
    title:
      'favicon.ico detected in HTML markup can be `image/x-icon` content-type',
    html: '<link rel="icon" href="/favicon.ico" type="image/x-icon" sizes="120x116">',
    contentType: 'image/x-icon',
    logoPath: 'favicon.ico'
  },
  {
    title:
      'favicon.ico detected in HTML markup can be `image/vnd.microsoft.icon` content-type',
    html: '<link rel="icon" href="/favicon.ico" type="image/x-icon" sizes="120x116">',
    contentType: 'image/vnd.microsoft.icon',
    logoPath: 'favicon.ico'
  },
  {
    title: "favicon.png detected in HTML markup can't be random content-type",
    html: '<link rel="icon" href="/favicon.png" type="image/x-icon" sizes="120x116">',
    contentType: 'image/svg+xml',
    body: '<svg></svg>',
    logoPath: null
  },
  {
    title:
      'favicon.png detected in HTML markup can be `image/png` content-type',
    html: '<link rel="icon" href="/favicon.png" type="image/x-icon" sizes="120x116">',
    contentType: 'image/png',
    logoPath: 'favicon.png'
  }
]) {
  test(title, async t => {
    const url = await runServer(t, async ({ res }) => {
      res.setHeader('content-type', contentType)
      // Cases without a body end the response with no arguments.
      if (body === undefined) res.end()
      else res.end(body)
    })

    const metadata = await createMetascraper()({ url, html })
    t.is(metadata.logo, logoPath === null ? null : `${url}${logoPath}`)
  })
}
4 changes: 3 additions & 1 deletion packages/metascraper/test/integration/fast-company/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ const url =

test('fast-company', async t => {
const html = await readFile(resolve(__dirname, 'input.html'))
const metadata = await metascraper({ html, url })
const { logo, ...metadata } = await metascraper({ html, url })
t.snapshot(metadata)
t.is(typeof logo, 'string')
t.true(new URL(logo).hostname.endsWith('.gstatic.com'), logo)
})
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ Generated by [AVA](https://avajs.dev).
description: 'Lack of access to capital is a big challenge, but so is the lack of access to networks and advisors.',
image: 'http://b.fastcompany.net/multisite_files/fastcompany/imagecache/620x350/poster/2016/05/3060169-poster-p-1-one-of-the-biggest-challenges-of-getting-funding-for-minority-owned-business.jpg',
lang: 'en',
logo: 'https://www.fastcompany.com/favicon.ico',
publisher: 'Fast Company',
title: 'One Of The Biggest Challenges Of Getting Funding For Minority-Owned Business',
url: 'http://www.fastcompany.com/3060169/one-of-the-biggest-challenges-of-getting-funding-for-minority-owned-business',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev).
description: 'Tech start-up Appthority’s office has plush conference rooms, soundproof phone booths, an enormous kitchen and a view of San Francisco Bay. It has ping-pong and foosball tables, beer on tap and 11 types of tea.',
image: 'http://www.trbimg.com/img-572421a4/turbine/la-fi-tn-tech-downturn-20160429',
lang: 'en',
logo: 'http://www.trbas.com/jive/prod/common/images/lanews-apple-touch-icon.1q2w3_9ffdb679907f116af126c65ff1edb27a.png',
logo: 'https://www.latimes.com/favicon.ico',
publisher: 'latimes.com',
title: 'As venture capital dries up, tech start-ups discover frugality',
url: 'http://www.latimes.com/business/technology/la-fi-tn-tech-downturn-20160429-story.html',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev).
description: 'Orphee Mickalad is on track to replace his former history teacher Tangi Utikere on Palmerston North City Council.',
image: 'https://resources.stuff.co.nz/content/dam/images/4/y/p/h/8/h/image.related.StuffLandscapeSixteenByNine.1420x800.4yr12n.png/1613526047477.jpg',
lang: 'en',
logo: 'https://www.stuff.co.nz/sics-assets/images/favicons/apple-touch-icon.png',
logo: 'https://www.stuff.co.nz/sics-assets/images/favicons/safari-pinned-tab.svg',
publisher: 'Stuff',
title: 'Orphee Mickalad leading Palmerston North by-election',
url: 'https://www.stuff.co.nz/manawatu-standard/news/300232751/orphee-mickalad-leading-palmerston-north-byelection',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Generated by [AVA](https://avajs.dev).
description: 'The world is a very malleable place. When I read biographies, early lives leap out the most. Leonardo da Vinci was a studio apprentice to Verrocchio at 14. Walt Disney took on a number of jobs, chiefly delivering papers, from 11 years old. Vladimir Nabokov published his first book (a collection of poems) at 16, while still in school. Andrew Carnegie',
image: 'https://substackcdn.com/image/fetch/w_1200,h_600,c_fill,f_jpg,q_auto:good,fl_progressive:steep,g_auto/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fef3bd0df-b9fa-4358-afee-116c23f4c55f_2560x1902.jpeg',
lang: 'en',
logo: 'https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1115e358-65d9-4f1c-872a-f1ea44965132%2Fapple-touch-icon-1024x1024.png',
logo: 'https://t1.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL&url=https://simonsarris.substack.com/p/the-most-precious-resource-is-agency&size=128',
publisher: 'The Map is Mostly Water',
title: 'The Most Precious Resource is Agency',
url: 'https://map.simonsarris.com/p/the-most-precious-resource-is-agency',
Expand Down
Binary file not shown.
1 change: 0 additions & 1 deletion packages/metascraper/test/integration/therams/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,5 @@ const url =
test('therams', async t => {
const html = await readFile(resolve(__dirname, 'input.html'))
const metadata = await metascraper({ html, url })
console.log(metadata)
t.snapshot(metadata)
})
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev).
description: 'Funding Snapshot:',
image: 'http://si.wsj.net/img/WSJ_Logo_black_social.gif',
lang: 'en',
logo: 'http://s.wsj.net/media/wsj-pro-favicon.ico',
logo: 'https://www.wsj.com/apple-touch-icon.png',
publisher: 'WSJ',
title: 'Funding Snapshot: Software Development Platform CircleCI Raises $18M',
url: 'http://www.wsj.com/articles/funding-snapshot-software-development-platform-circleci-raises-18m-1463398202',
Expand Down
Binary file not shown.

0 comments on commit 60079a0

Please sign in to comment.