Skip to content

Commit

Permalink
Make TXT validation use local git history
Browse files Browse the repository at this point in the history
This cuts the amount of Github API traffic required to validate the entire
PSL by >99%.
  • Loading branch information
danderson committed Oct 16, 2024
1 parent 29dfff7 commit f8987d1
Show file tree
Hide file tree
Showing 3 changed files with 212 additions and 13 deletions.
155 changes: 155 additions & 0 deletions tools/internal/githistory/history.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// Package githistory provides helpers to look up PSL PR changes in a
// local git repository.
package githistory

import (
"bytes"
"fmt"
"os/exec"
"regexp"
"strconv"
"strings"
)

// PRInfo lists commit metadata for a given Github PR.
type PRInfo struct {
Num int
// CommitHash is the git hash in which the PSL contains the
// changes of this PR.
CommitHash string
// ParentHash is the git hash immediately before this PR's changes
// were added to the PSL.
ParentHash string
}

// History is PR metadata extracted from a local PSL git clone.
type History struct {
GitPath string // path to the local git clone
PRs map[int]PRInfo
}

// gitTopLevel finds the top level of the git repository that contains
// path, if any.
func gitToplevel(path string) (string, error) {
bs, err := gitStdout(path, "rev-parse", "--show-toplevel")
if err != nil {
return "", fmt.Errorf("finding top level of git repo %q: %w", path, err)
}
return string(bs), nil
}

// GetPRInfo extracts PR metadata from the git repository at gitPath.
func GetPRInfo(gitPath string) (*History, error) {
toplevel, err := gitToplevel(gitPath)
if err != nil {
return nil, err
}

// List all commits that have a description with a '(#1234)' at
// the end of a line of description or "Merge pull request #1234
// from" at the start, and print the matching commits in a form
// that's easy to parse.
prCommits, err := gitStdout(toplevel, "log",
"--perl-regexp",
`--grep=\(#\d+\)$`,
`--grep=^Merge pull request #\d+ from`,
"--pretty=%H@%P@%s",
"master")

ret := &History{
GitPath: toplevel,
PRs: map[int]PRInfo{},
}
for _, line := range strings.Split(string(prCommits), "\n") {
fs := strings.SplitN(line, "@", 3)
if len(fs) != 3 {
return nil, fmt.Errorf("unexpected line format %q", line)
}
commit, parentsStr, desc := fs[0], fs[1], fs[2]
parents := strings.Split(parentsStr, " ")
// For merge commits, we have multiple parents, and we want
// the "main branch" side of the merge, i.e. the state of the
// tree before the PR was merged. Empirically, Github always
// lists that commit as the 1st parent in merge commits.
//
// For squash commits, there is only one parent.
//
// This logic cannot handle rebase-and-merge actions, since
// those by definition erase the PR history from the git
// history. However, the PSL doesn't use rebase-and-merge by
// convention, so this works out. Worst case, if this logic
// does catch a rebase-and-merge, the result will be false
// positives (suffix flagged for invalid TXT record), if the
// PR contained more than 1 commit.
parent := parents[0]
ms := prNumberRe.FindStringSubmatch(desc)
if len(ms) != 3 {
// The grep on git log returned a false positive where the
// PR number is not on the first line of the commit
// message. This is not a commit in the standard github
// format for PRs.
continue
}

var prNum int
if ms[1] != "" {
prNum, err = strconv.Atoi(ms[1])
} else {
prNum, err = strconv.Atoi(ms[2])
}
if err != nil {
// Shouldn't happen, the regex isolates digits, why can't
// we parse digits?
return nil, fmt.Errorf("unexpected invalid PR number string %q", ms[1])
}

ret.PRs[prNum] = PRInfo{
Num: prNum,
CommitHash: commit,
ParentHash: parent,
}
}

return ret, nil
}

// GetPSL returns the PSL file at the given commit hash in the git
// repository at gitPath.
func GetPSL(gitPath string, hash string) ([]byte, error) {
toplevel, err := gitToplevel(gitPath)
if err != nil {
return nil, err
}

bs, err := gitStdout(toplevel, "show", fmt.Sprintf("%s:public_suffix_list.dat", hash))
if err != nil {
return nil, err
}

return bs, nil
}

// Matches either "(#1234)" at the end of a line, or "Merge pull
// request #1234 from" at the start of a line. The first is how github
// formats squash-and-merge commits, the second is how github formats
// 2-parent merge commits.
var prNumberRe = regexp.MustCompile(`(?:\(#(\d+)\)$)|(?:^Merge pull request #(\d+) from)`)

func gitStdout(repoPath string, args ...string) ([]byte, error) {
args = append([]string{"-C", repoPath}, args...)
c := exec.Command("git", args...)
var stderr bytes.Buffer
c.Stderr = &stderr
bs, err := c.Output()
if err != nil {
// Make the error show the git commandline and captured
// stderr, not just the plain "exited with code 45" error.
cmdline := append([]string{"git"}, args...)
var stderrStr string
if stderr.Len() != 0 {
stderrStr = "stderr:\n" + stderr.String()
}
return nil, fmt.Errorf("running %q: %w. %s", strings.Join(cmdline, " "), err, stderrStr)
}
return bytes.TrimSpace(bs), nil
}
32 changes: 27 additions & 5 deletions tools/internal/parser/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/creachadair/mds/mapset"
"github.com/creachadair/taskgroup"
"github.com/publicsuffix/list/tools/internal/domain"
"github.com/publicsuffix/list/tools/internal/githistory"
"github.com/publicsuffix/list/tools/internal/github"
)

Expand Down Expand Up @@ -124,10 +125,10 @@ func validateSuffixUniqueness(block Block) (errs []error) {
// validations are slower than offline validation, especially when
// checking the entire PSL. All online validations respect
// cancellation on the given context.
func ValidateOnline(ctx context.Context, l *List, client *github.Repo) (errs []error) {
func ValidateOnline(ctx context.Context, l *List, client *github.Repo, prHistory *githistory.History) (errs []error) {
for _, section := range BlocksOfType[*Section](l) {
if section.Name == "PRIVATE DOMAINS" {
errs = append(errs, validateTXTRecords(ctx, section, client)...)
errs = append(errs, validateTXTRecords(ctx, section, client, prHistory)...)
break
}
}
Expand Down Expand Up @@ -184,15 +185,18 @@ type txtRecordChecker struct {
// {suffixes: [foo.com]}, meaning if we look at PR 123 on github,
// we want to see a change for foo.com.
prExpected map[int]*prExpected

hist *githistory.History
}

// validateTXTRecords checks the TXT records of all Suffix and
// Wildcard blocks found under b.
func validateTXTRecords(ctx context.Context, b Block, client *github.Repo) (errs []error) {
func validateTXTRecords(ctx context.Context, b Block, client *github.Repo, prHistory *githistory.History) (errs []error) {
checker := txtRecordChecker{
ctx: ctx,
prExpected: map[int]*prExpected{},
gh: client,
gh: client,
hist: prHistory,
}

// TXT checking happens in two phases: first, look up all TXT
Expand Down Expand Up @@ -365,6 +369,24 @@ func (c *txtRecordChecker) processLookupResult(res txtResult) {
}
}

func (c *txtRecordChecker) getPRPSLs(prNum int) (before, after []byte, err error) {
if c.hist != nil {
if inf, ok := c.hist.PRs[prNum]; ok {
before, err = githistory.GetPSL(c.hist.GitPath, inf.ParentHash)
if err != nil {
return nil, nil, err
}
after, err = githistory.GetPSL(".", inf.CommitHash)
if err != nil {
return nil, nil, err
}
return before, after, nil
}
}

return c.gh.PSLForPullRequest(c.ctx, prNum)
}

// checkPRs looks up the given Github PR, and verifies that it changes
// all the suffixes and wildcards provided in info.
func (c *txtRecordChecker) checkPR(prNum int, info *prExpected) []error {
Expand All @@ -385,7 +407,7 @@ func (c *txtRecordChecker) checkPR(prNum int, info *prExpected) []error {

var ret []error

beforeBs, afterBs, err := c.gh.PSLForPullRequest(c.ctx, prNum)
beforeBs, afterBs, err := c.getPRPSLs(prNum)
if err != nil {
for _, suf := range info.suffixes {
ret = append(ret, ErrTXTCheckFailure{suf, err})
Expand Down
38 changes: 30 additions & 8 deletions tools/psltool/psltool.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/creachadair/flax"
"github.com/creachadair/mds/mdiff"
"github.com/natefinch/atomic"
"github.com/publicsuffix/list/tools/internal/githistory"
"github.com/publicsuffix/list/tools/internal/github"
"github.com/publicsuffix/list/tools/internal/parser"
)
Expand Down Expand Up @@ -136,7 +137,8 @@ func runFmt(env *command.Env, path string) error {
var validateArgs struct {
Owner string `flag:"gh-owner,default=publicsuffix,Owner of the github repository to check"`
Repo string `flag:"gh-repo,default=list,Github repository to check"`
Online bool `flag:"online-checks,Run validations that require querying third-party servers"`
Clone string `flag:"gh-local-clone,Path to a local clone of the repository specified by gh-owner/gh-repo"`
Online bool `flag:"online-checks,Run validations that require querying third-party servers"`
}

func isHex(s string) bool {
Expand All @@ -157,8 +159,10 @@ func runValidate(env *command.Env, pathOrHash string) error {
Repo: checkPRArgs.Repo,
}

isPath := false
if _, err = os.Stat(pathOrHash); err == nil {
// input is a local file
isPath = true
bs, err = os.ReadFile(pathOrHash)
} else if isHex(pathOrHash) {
// input looks like a git hash
Expand All @@ -174,13 +178,22 @@ func runValidate(env *command.Env, pathOrHash string) error {
errs = append(errs, psl.Clean()...)
errs = append(errs, parser.ValidateOffline(psl)...)
if validateArgs.Online {
if os.Getenv("PSLTOOL_ALLOW_BROKEN") == "" {
errs = append(errs, fmt.Errorf("refusing to run online validation on the entire PSL, it's currently broken and gets you rate-limited by github. For development, export PSLTOOL_ALLOW_BROKEN=1."))
} else {
ctx, cancel := context.WithTimeout(env.Context(), 1200*time.Second)
defer cancel()
errs = append(errs, parser.ValidateOnline(ctx, psl, &client)...)
if validateArgs.Clone == "" && isPath {
// Assume the PSL file being validated might be in a git
// clone, and try to use that as the reference for history.
validateArgs.Clone = filepath.Dir(pathOrHash)
}
if validateArgs.Clone == "" {
return errors.New("--gh-local-clone is required for full validation")
}
prHistory, err := githistory.GetPRInfo(validateArgs.Clone)
if err != nil {
return fmt.Errorf("failed to get local PR history, refusing to run full validation to avoid Github DoS: %w", err)
}

ctx, cancel := context.WithTimeout(env.Context(), 1200*time.Second)
defer cancel()
errs = append(errs, parser.ValidateOnline(ctx, psl, &client, prHistory)...)
}

clean := psl.MarshalPSL()
Expand All @@ -205,6 +218,7 @@ func runValidate(env *command.Env, pathOrHash string) error {
var checkPRArgs struct {
Owner string `flag:"gh-owner,default=publicsuffix,Owner of the github repository to check"`
Repo string `flag:"gh-repo,default=list,Github repository to check"`
Clone string `flag:"gh-local-clone,Path to a local clone of the repository specified by gh-owner/gh-repo"`
Online bool `flag:"online-checks,Run validations that require querying third-party servers"`
}

Expand All @@ -229,9 +243,17 @@ func runCheckPR(env *command.Env, prStr string) error {
errs = append(errs, after.Clean()...)
errs = append(errs, parser.ValidateOffline(after)...)
if checkPRArgs.Online {
var prHistory *githistory.History
if validateArgs.Clone != "" {
prHistory, err = githistory.GetPRInfo(validateArgs.Clone)
if err != nil {
return fmt.Errorf("failed to get local PR history: %w", err)
}
}

ctx, cancel := context.WithTimeout(env.Context(), 300*time.Second)
defer cancel()
errs = append(errs, parser.ValidateOnline(ctx, after, &client)...)
errs = append(errs, parser.ValidateOnline(ctx, after, &client, prHistory)...)
}

clean := after.MarshalPSL()
Expand Down

0 comments on commit f8987d1

Please sign in to comment.