Add imdb2torrent files from deflix-stremio v0.11.0

https://github.com/doingodswork/deflix-stremio Revision 254d14c6f8ccc21338001da83299a17a9f6c4b29
Deflix-tv · Jan 17, 2021 · 86fc251 · 86fc251
1 parent aca8442
commit 86fc251
Show file tree

Hide file tree

Showing 10 changed files with 1,577 additions and 0 deletions.
diff --git a/1337x.go b/1337x.go
@@ -0,0 +1,303 @@
+package imdb2torrent
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"net/url"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+	"go.uber.org/zap"
+)
+
+// LeetxClientOptions holds the configurable settings of a 1337x client.
+type LeetxClientOptions struct {
+	// BaseURL is the scheme and host that request paths are appended to.
+	// Paths are joined with a leading "/", so it should not end with a slash.
+	BaseURL  string
+	// Timeout is used as the timeout of the client's HTTP client.
+	Timeout  time.Duration
+	// CacheAge is the maximum age of a cached result before it's treated as expired.
+	CacheAge time.Duration
+}
+
+func NewLeetxClientOpts(baseURL string, timeout, cacheAge time.Duration) LeetxClientOptions {
+	return LeetxClientOptions{
+		BaseURL:  baseURL,
+		Timeout:  timeout,
+		CacheAge: cacheAge,
+	}
+}
+
+// DefaultLeetxClientOpts is a ready-to-use LeetxClientOptions value:
+// the https://1337x.to site, a 5 second HTTP timeout and a 24 hour cache age.
+var DefaultLeetxClientOpts = LeetxClientOptions{
+	BaseURL:  "https://1337x.to",
+	Timeout:  5 * time.Second,
+	CacheAge: 24 * time.Hour,
+}
+
+// Compile-time check that *leetxClient satisfies the MagnetSearcher interface.
+var _ MagnetSearcher = (*leetxClient)(nil)
+
+// leetxClient scrapes 1337x for torrents and caches the results it finds.
+type leetxClient struct {
+	// baseURL is prepended (followed by "/") to every search path and to links scraped from the HTML.
+	baseURL          string
+	// httpClient performs all GET requests; its timeout comes from the client options.
+	httpClient       *http.Client
+	// cache stores the results per ID, under the key "<id>-1337x".
+	cache            Cache
+	// metaGetter resolves an IMDb ID to the title (and year) that is used as search term.
+	metaGetter       MetaGetter
+	// cacheAge is the maximum age of a cache entry before it's ignored and the site is scraped again.
+	cacheAge         time.Duration
+	logger           *zap.Logger
+	// logFoundTorrents enables a debug log entry for every torrent that was found.
+	logFoundTorrents bool
+}
+
+// NewLeetxClient creates a 1337x client from the given options.
+// The options' timeout is applied to a dedicated HTTP client; the cache, meta getter and logger are used as passed.
+// If logFoundTorrents is true, every found torrent is written to the debug log.
+func NewLeetxClient(opts LeetxClientOptions, cache Cache, metaGetter MetaGetter, logger *zap.Logger, logFoundTorrents bool) *leetxClient {
+	httpClient := &http.Client{Timeout: opts.Timeout}
+
+	client := leetxClient{
+		baseURL:          opts.BaseURL,
+		httpClient:       httpClient,
+		cache:            cache,
+		metaGetter:       metaGetter,
+		cacheAge:         opts.CacheAge,
+		logger:           logger,
+		logFoundTorrents: logFoundTorrents,
+	}
+	return &client
+}
+
+// FindMovie scrapes 1337x to find torrents for the movie with the given IMDb ID.
+// 1337x can only be searched by name, so the movie's title and year are looked up via the meta getter (Stremio Cinemeta) first.
+// If no error occurred, but there are just no torrents for the movie yet, an empty result and *no* error are returned.
+func (c *leetxClient) FindMovie(ctx context.Context, imdbID string) ([]Result, error) {
+	// Resolve the IMDb ID to a title (and year, if known)
+	meta, err := c.metaGetter.GetMovieSimple(ctx, imdbID)
+	if err != nil {
+		return nil, fmt.Errorf("Couldn't get movie name via Cinemeta for IMDb ID %v: %v", imdbID, err)
+	}
+
+	query := meta.Title
+	if meta.Year != 0 {
+		query = query + " " + strconv.Itoa(meta.Year)
+	}
+
+	// The search term is part of the URL path, so it must be path-escaped
+	searchPath := "category-search/" + url.PathEscape(query) + "/Movies/1/"
+
+	return c.find(ctx, imdbID, searchPath, meta.Title, false)
+}
+
+// FindTVShow scrapes 1337x to find torrents for the given IMDb ID + season + episode.
+// 1337x can only be searched by name, so the TV show's title is looked up via the meta getter (Stremio Cinemeta) first.
+// If no error occurred, but there are just no torrents for the TV show yet, an empty result and *no* error are returned.
+func (c *leetxClient) FindTVShow(ctx context.Context, imdbID string, season, episode int) ([]Result, error) {
+	// The combined ID has the form "imdbID:season:episode"
+	id := strings.Join([]string{imdbID, strconv.Itoa(season), strconv.Itoa(episode)}, ":")
+
+	meta, err := c.metaGetter.GetTVShowSimple(ctx, imdbID, season, episode)
+	if err != nil {
+		return nil, fmt.Errorf("Couldn't get TV show title via Cinemeta for ID %v: %v", id, err)
+	}
+
+	query, err := createTVShowSearch(ctx, c.metaGetter, imdbID, season, episode)
+	if err != nil {
+		return nil, err
+	}
+
+	// The search term is part of the URL path, so it must be path-escaped
+	searchPath := "category-search/" + url.PathEscape(query) + "/TV/1/"
+
+	return c.find(ctx, id, searchPath, meta.Title, true)
+}
+
+// find returns the torrents that 1337x lists for an already built search path.
+//
+// id is used for the cache key and the log fields: it's the IMDb ID for movies and "imdbID:season:episode" for TV shows.
+// urlPath is the search path relative to the base URL, title is copied into every Result,
+// and isTVShow tells whether the search is for a TV show (in which case the detour via the general movie page is skipped).
+//
+// A cached result that isn't older than the configured cache age is returned without any HTTP request.
+// Otherwise the search page is scraped, all 720p / 1080p / 2160p torrent pages are visited in parallel and the collected results are cached.
+// An error is only returned if the search page can't be fetched or doesn't contain any search result link.
+// If there's just no torrent in a matching quality, nil and *no* error are returned.
+func (c *leetxClient) find(ctx context.Context, id, urlPath, title string, isTVShow bool) ([]Result, error) {
+	zapFieldID := zap.String("id", id)
+	zapFieldTorrentSite := zap.String("torrentSite", "1337x")
+
+	// Check cache first
+	cacheKey := id + "-1337x"
+	torrentList, created, found, err := c.cache.Get(cacheKey)
+	if err != nil {
+		c.logger.Error("Couldn't get torrent results from cache", zap.Error(err), zapFieldID, zapFieldTorrentSite)
+	} else if !found {
+		c.logger.Debug("Torrent results not found in cache", zapFieldID, zapFieldTorrentSite)
+	} else if time.Since(created) > (c.cacheAge) {
+		expiredSince := time.Since(created.Add(c.cacheAge))
+		c.logger.Debug("Hit cache for torrents, but item is expired", zap.Duration("expiredSince", expiredSince), zapFieldID, zapFieldTorrentSite)
+	} else {
+		c.logger.Debug("Hit cache for torrents, returning results", zap.Int("torrentCount", len(torrentList)), zapFieldID, zapFieldTorrentSite)
+		return torrentList, nil
+	}
+
+	// Search on 1337x
+
+	reqUrl := c.baseURL + "/" + urlPath
+	origDoc, err := c.getDoc(ctx, reqUrl)
+	if err != nil {
+		return nil, err
+	}
+	// Pick the first element, it's the most likely one to belong to the correct movie / TV show
+	torrentPath, ok := origDoc.Find(".table-list tbody td a").Next().Attr("href")
+	if !ok {
+		return nil, fmt.Errorf("Couldn't find search result")
+	}
+
+	// Try to go via the first search result to the general movie page. This guarantees that all torrents found on that page are definitive matches for the movie.
+	// But this only works for movies, not for TV shows.
+	// For movies, if we don't find the general movie page, we can always go back to the original search result page as well.
+	// TODO: For TV shows we could try to go via the episode page.
+	var docToSearch *goquery.Document
+	if !isTVShow {
+		reqUrl = c.baseURL + torrentPath
+		firstTorrentDoc, err := c.getDoc(ctx, reqUrl)
+		if err != nil {
+			c.logger.Warn("Couldn't get HTML doc for first torrent result", zap.Error(err), zapFieldID, zapFieldTorrentSite)
+			docToSearch = origDoc
+		} else {
+			// Find the general movie page URL
+			movieInfoURL, ok := firstTorrentDoc.Find(".content-row h3 a").Attr("href")
+			// Only if this was found, we try to go through the torrent pages for the movie page
+			if ok && movieInfoURL != "" {
+				reqUrl = c.baseURL + movieInfoURL
+				docToSearch, err = c.getDoc(ctx, reqUrl)
+				if err != nil {
+					// Only log, but continue - we can always use the results from the original search result page
+					c.logger.Warn("Couldn't get HTML doc for general movie page", zap.Error(err), zapFieldID, zapFieldTorrentSite)
+					docToSearch = origDoc
+				}
+			} else {
+				docToSearch = origDoc
+			}
+		}
+	} else {
+		docToSearch = origDoc
+	}
+	// Go through elements
+	var torrentPageURLs []string
+	docToSearch.Find(".table-list tbody tr").Each(func(i int, s *goquery.Selection) {
+		linkText := s.Find("a").Next().Text()
+		if strings.Contains(linkText, "720p") || strings.Contains(linkText, "1080p") || strings.Contains(linkText, "2160p") {
+			torrentLink, ok := s.Find("a").Next().Attr("href")
+			if !ok || torrentLink == "" {
+				c.logger.Warn("Couldn't find link to the torrent page, did the HTML change?", zapFieldID, zapFieldTorrentSite)
+				return
+			}
+			torrentPageURLs = append(torrentPageURLs, c.baseURL+torrentLink)
+		}
+	})
+	// TODO: We should differentiate between "parsing went wrong" and "just no search results".
+	if len(torrentPageURLs) == 0 {
+		return nil, nil
+	}
+
+	// Visit each torrent page *in parallel* and get the magnet URL
+
+	// The channel is buffered for the maximum number of goroutines, so no goroutine ever blocks on sending.
+	resultChan := make(chan Result, len(torrentPageURLs))
+	// Count the goroutines that were actually started. URLs that are skipped below never send a result,
+	// so waiting for len(torrentPageURLs) results would block forever.
+	startedCount := 0
+
+	for _, torrentPageURL := range torrentPageURLs {
+		// Use configured base URL, which could be a proxy that we want to go through
+		torrentPageURL, err = replaceURL(torrentPageURL, c.baseURL)
+		if err != nil {
+			c.logger.Warn("Couldn't replace URL which was retrieved from an HTML link", zap.Error(err), zapFieldID, zapFieldTorrentSite)
+			continue
+		}
+
+		startedCount++
+		// Each goroutine sends exactly one Result. An empty Result signals "nothing usable found on this page".
+		go func(goTorrentPageURL string) {
+			doc, err := c.getDoc(ctx, goTorrentPageURL)
+			if err != nil {
+				resultChan <- Result{}
+				return
+			}
+
+			magnet, ok := doc.Find(".box-info ul li").First().Find("a").Attr("href")
+			if !ok || magnet == "" {
+				resultChan <- Result{}
+				return
+			}
+
+			var quality string
+			switch {
+			case strings.Contains(magnet, "720p"):
+				quality = "720p"
+			case strings.Contains(magnet, "1080p"):
+				quality = "1080p"
+			case strings.Contains(magnet, "2160p"):
+				quality = "2160p"
+			default:
+				// This should never be the case, because it was previously checked during scraping
+				resultChan <- Result{}
+				return
+			}
+
+			if strings.Contains(magnet, "10bit") {
+				quality += " 10bit"
+			}
+
+			// https://en.wikipedia.org/wiki/Pirated_movie_release_types
+			if strings.Contains(magnet, "HDCam") {
+				quality += (" (⚠️cam)")
+			}
+
+			// We should mark 1337x movies somehow, because we cannot be 100% sure it's the correct movie.
+			// The quality might later be used as title, as suggested by Stremio.
+			// (Albeit only in a specific case for a specific reason)
+			quality += "\n(⚠️guessed match)"
+
+			// look for "btih:dd8255ecdc7ca55fb0bbf81323d87062db1f6d1c&" via regex and then cut out the hash
+			match := magnet2InfoHashRegex.Find([]byte(magnet))
+			infoHash := strings.TrimPrefix(string(match), "btih:")
+			infoHash = strings.TrimSuffix(infoHash, "&")
+			infoHash = strings.ToUpper(infoHash)
+
+			if infoHash == "" {
+				c.logger.Warn("Couldn't extract info_hash. Did the HTML change?", zap.String("magnet", magnet), zapFieldID, zapFieldTorrentSite)
+				resultChan <- Result{}
+				return
+			} else if len(infoHash) != 40 {
+				c.logger.Warn("InfoHash isn't 40 characters long", zap.String("magnet", magnet), zapFieldID, zapFieldTorrentSite)
+				resultChan <- Result{}
+				return
+			}
+
+			result := Result{
+				Title:     title,
+				Quality:   quality,
+				InfoHash:  infoHash,
+				MagnetURL: magnet,
+			}
+			if c.logFoundTorrents {
+				c.logger.Debug("Found torrent", zap.String("title", title), zap.String("quality", quality), zap.String("infoHash", infoHash), zap.String("magnet", magnet), zapFieldID, zapFieldTorrentSite)
+			}
+
+			resultChan <- result
+		}(torrentPageURL)
+	}
+
+	var results []Result
+	// We don't use a timeout channel because the HTTP clients have a timeout so the goroutines are guaranteed to finish
+	for i := 0; i < startedCount; i++ {
+		result := <-resultChan
+		// Empty results are only placeholders of goroutines that didn't find a usable torrent
+		if result.MagnetURL != "" {
+			results = append(results, result)
+		}
+	}
+
+	// Fill cache, even if there are no results, because that's just the current state of the torrent site.
+	// Any actual errors would have returned earlier.
+	if err := c.cache.Set(cacheKey, results); err != nil {
+		c.logger.Error("Couldn't cache torrents", zap.Error(err), zap.String("cache", "torrent"), zapFieldID, zapFieldTorrentSite)
+	}
+
+	return results, nil
+}
+
+// IsSlow reports whether this client is considered slow. For the 1337x client it always returns false.
+// NOTE(review): presumably required by the MagnetSearcher interface - confirm against its definition.
+func (c *leetxClient) IsSlow() bool {
+	return false
+}
+
+// getDoc sends a GET request to the given URL and parses the response body as HTML document.
+// The request is bound to ctx, so it's aborted when ctx is cancelled or its deadline is exceeded
+// (in addition to the timeout of the HTTP client).
+// An error is returned if the request can't be created or sent, if the response status isn't 200 OK
+// or if the body can't be loaded by goquery.
+func (c *leetxClient) getDoc(ctx context.Context, url string) (*goquery.Document, error) {
+	req, err := http.NewRequest(http.MethodGet, url, nil)
+	if err != nil {
+		return nil, fmt.Errorf("Couldn't create GET request for %v: %v", url, err)
+	}
+	// Request.WithContext panics on a nil context, so fall back to a background context in that case
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	res, err := c.httpClient.Do(req.WithContext(ctx))
+	if err != nil {
+		return nil, fmt.Errorf("Couldn't GET %v: %v", url, err)
+	}
+	defer res.Body.Close()
+	if res.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("Bad GET response: %v", res.StatusCode)
+	}
+
+	// Load the HTML document
+	doc, err := goquery.NewDocumentFromReader(res.Body)
+	if err != nil {
+		return nil, fmt.Errorf("Couldn't load the HTML in goquery: %v", err)
+	}
+
+	return doc, nil
+}
diff --git a/cache.go b/cache.go
@@ -0,0 +1,60 @@
+package imdb2torrent
+
+import (
+	"sync"
+	"time"
+)
+
+// CacheItem combines Result objects and a creation time in a single struct.
+// This can be useful for implementing the Cache interface, but is not necessarily required.
+// See the InMemoryCache example implementation of the Cache interface for its usage.
+type CacheItem struct {
+	// Results are the cached Result objects.
+	Results []Result
+	// Created is the time at which the results were stored in the cache.
+	Created time.Time
+}
+
+// Cache is the interface that the imdb2torrent clients use for caching results.
+// A package user must pass an implementation of this interface.
+// Usually you create a simple wrapper around an existing cache package.
+// An example implementation is the InMemoryCache in this package.
+type Cache interface {
+	// Set stores the results under the given key.
+	Set(key string, results []Result) error
+	// Get returns the results stored under the given key, the time they were cached
+	// and whether the key was found in the cache.
+	Get(key string) ([]Result, time.Time, bool, error)
+}
+
+// Compile-time check that *InMemoryCache satisfies the Cache interface.
+var _ Cache = (*InMemoryCache)(nil)
+
+// InMemoryCache is an example implementation of the Cache interface.
+// It doesn't persist its data, so it's not suited for production use of the imdb2torrent package.
+type InMemoryCache struct {
+	// cache maps a cache key to the results stored for it and their creation time.
+	cache map[string]CacheItem
+	// lock guards cache: Set takes the write lock, Get takes the read lock.
+	lock  *sync.RWMutex
+}
+
+// NewInMemoryCache returns an empty InMemoryCache that is ready to use.
+func NewInMemoryCache() *InMemoryCache {
+	c := new(InMemoryCache)
+	c.cache = make(map[string]CacheItem)
+	c.lock = new(sync.RWMutex)
+	return c
+}
+
+// Set puts the given Result objects into the cache under the given key, together with the current time.
+// An existing entry for the key is replaced. The returned error is always nil.
+func (c *InMemoryCache) Set(key string, results []Result) error {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	item := CacheItem{Results: results, Created: time.Now()}
+	c.cache[key] = item
+
+	return nil
+}
+
+// Get looks up the given key and returns the cached Result objects and the time they were cached.
+// The boolean return value signals if the key was found in the cache;
+// if it wasn't, the results are nil and the time is the zero time. The returned error is always nil.
+func (c *InMemoryCache) Get(key string) ([]Result, time.Time, bool, error) {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+
+	if item, ok := c.cache[key]; ok {
+		return item.Results, item.Created, true, nil
+	}
+	return nil, time.Time{}, false, nil
+}