diff --git a/.gitignore b/.gitignore
index f3b2c9f0..867f810d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 bulktest/wptkey.inc.php
+bulktest/batch.log*
+bulktest/httparchive_batch_lock*
 node_modules/
 tmp/
 downloads/
 harviewer/
 *.log
+*nohup.out
diff --git a/bulktest/batch_lib.inc b/bulktest/batch_lib.inc
index 77a2b6c6..d7848f6f 100644
--- a/bulktest/batch_lib.inc
+++ b/bulktest/batch_lib.inc
@@ -27,7 +27,7 @@ define("DONE", 4);
 // The status table saves $gErrBase + i to indicate that there is a permanent error happens when the test is in the status i.
 $gErrBase = 900;
 // The list of tasks for a batch run.
-$gNumParse = 5; // the number of parse tasks to fork
+$gNumParse = 10; // the number of parse tasks to fork
 $gaTasks = array("submit", "status", "obtain");
 for ( $i = 1; $i <= $gNumParse; $i++ ) {
   array_push($gaTasks, "parse" . $i); // dynamically create the desired number of parse tasks
@@ -91,12 +91,10 @@ function submitTest(&$record, $status) {
   $wptServer = wptServer();

   $location = $record['location'];
-  /*
   if ($location == 'California:Chrome.3G') {
     $location = 'California:Chrome.4G';
   }
-  */
-  $request = $wptServer . 'runtest.php?f=xml&priority=6&timeline=1&url=' . urlencode($record['url']) .
+  $request = $wptServer . 'runtest.php?f=xml&debug=1&priority=6&timeline=1&url=' . urlencode($record['url']) .
     "&location=$location&runs=$runs" .
     ( $private ? "&private=1" : "" ) .
     ( $video ? "&video=1" : "" ) .
@@ -138,10 +136,18 @@ function submitTest(&$record, $status) {

 // Submit the batch test to WPT server.
 function submitBatch() {
+  global $gMaxQueueLength;
+  $submittedTests = countTestsWithCode(SUBMITTED);
   $unsubmitTests = obtainTestsWithCode(NOT_STARTED);
   if ( !isEmptyQuery($unsubmitTests) ) {
     while ($row = mysqli_fetch_assoc($unsubmitTests)) {
       submitTest($row, 0);
+      // Limit the number of in-flight tests
+      if ($gMaxQueueLength) {
+        $submittedTests++;
+        if ($submittedTests >= $gMaxQueueLength)
+          break;
+      }
     }
   }
 }
@@ -180,13 +186,13 @@ function checkWPTStatus() {
       $nNoResult = 0; // reset
       setStatus($row['statusid'], SUBMIT_DONE);
     }
-    elseif ( 400 <= $code ) {
+    elseif ( 400 <= $code || 100 > $code ) {
       $nNoResult = 0; // reset
       setStatus($row['statusid'], SUBMITTED + $gErrBase);
     }
     else {
       $nNoResult++;
-      if ( $nNoResult > 200 ) {
+      if ( $nNoResult > 2000 ) {
         // Quick bail:
         // If we've exhausted all the completed results we do NOT want
         // to continue checking EVERY remaining test. So instead we bail
@@ -966,7 +972,10 @@ function prettyType($mimeType, $ext) {
   else if ( "xml" === $ext ) {
     return "xml";
   }
-  else if ( false !== strpos($mimeType, "flash") || "mp4" === $ext || "swf" === $ext || "f4v" === $ext || "flv" === $ext ) {
+  // Video extensions: mp4, webm, ts, m4v, m4s, mov, ogv
+  else if ( false !== strpos($mimeType, "flash") || false !== strpos($mimeType, "webm") || false !== strpos($mimeType, "mp4") || false !== strpos($mimeType, "flv")
+      || "mp4" === $ext || "webm" === $ext || "ts" === $ext || "m4v" === $ext || "m4s" === $ext || "mov" === $ext || "ogv" === $ext
+      || "swf" === $ext || "f4v" === $ext || "flv" === $ext ) {
     return "video";
   }
   else if ( false !== strpos($mimeType, "html") || "html" === $ext || "htm" === $ext ) {
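For context on the submitBatch() hunk above: together with the new $gMaxQueueLength setting added to settings.inc at the end of this diff, it caps how many tests may be in flight on the WebPageTest server at once. A minimal sketch of the throttle, assuming countTestsWithCode(), obtainTestsWithCode(), isEmptyQuery(), and submitTest() behave as they do in batch_lib.inc (submitBatchSketch is a hypothetical name, not the shipped function):

    <?php
    function submitBatchSketch() {
      global $gMaxQueueLength;                    // e.g. 60000 from settings.inc
      $inFlight = countTestsWithCode(SUBMITTED);  // tests already queued on WPT
      $pending = obtainTestsWithCode(NOT_STARTED);
      if ( !isEmptyQuery($pending) ) {
        while ( $row = mysqli_fetch_assoc($pending) ) {
          submitTest($row, 0);
          // Stop once the cap is hit; the next batch_process.php pass
          // resumes submitting as completed tests drain off the queue.
          if ( $gMaxQueueLength && ++$inFlight >= $gMaxQueueLength ) {
            break;
          }
        }
      }
    }

Since batch_process.php is re-run for each processing pass, a bounded submit per pass is enough; there is no need to block and wait for the queue to drain.
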
diff --git a/bulktest/batch_process.php b/bulktest/batch_process.php
index 567b2c1f..c1b3a6c2 100755
--- a/bulktest/batch_process.php
+++ b/bulktest/batch_process.php
@@ -72,6 +72,10 @@
   updateCrawl($labelFromRun, $gArchive, $locations[0], array(
       "finishedDateTime" => time()
     ));
+  // Cleanup the requests table after the dump is complete (it is only used during a crawl to calculate aggregate stats)
+  doSimpleCommand("TRUNCATE TABLE $gRequestsTable;");
+  doSimpleCommand("optimize table $gRequestsTable;");
+
   cprint(date("G:i") . ": DONE with crawl!");
   exit(0);
 }
diff --git a/bulktest/batch_start.php b/bulktest/batch_start.php
index 43fd28c3..dfcbb7c0 100755
--- a/bulktest/batch_start.php
+++ b/bulktest/batch_start.php
@@ -110,10 +110,10 @@
   loadUrlsFromDB($crawlid, $label, $gNumUrls);
 }
 else if ( $gbMobile ) {
-  loadUrlsFromDB($crawlid, $label, 1500000);
+  loadUrlsFromDB($crawlid, $label, 10000000);
 }
 else if ( $gbDev ) {
-  loadUrlsFromDB($crawlid, $label, 1500000);
+  loadUrlsFromDB($crawlid, $label, 10000000);
 }

 $numUrls = doSimpleQuery("select count(*) from $gStatusTable where crawlid=$crawlid;");
diff --git a/bulktest/bootstrap.inc b/bulktest/bootstrap.inc
index f4ad0bd0..f1a435f2 100644
--- a/bulktest/bootstrap.inc
+++ b/bulktest/bootstrap.inc
@@ -28,7 +28,7 @@ $private = false;
 $docComplete = false; // &web10 WPT param
 $gbMobileEmul = ( $gbAndroid || $gbMobile ? 1 : 0 ); // WPT param for whether we should use mobile emulation
 $fvonly = true;
-$runs = 3;
+$runs = 1;
 $mv = 1; // only save video for Median run
 $gbNoScript = false;
 $wptApiKey = '';
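A note on the cleanup strategy used above and in cleanup-requests.php below: TRUNCATE TABLE replaces the earlier "delete from ... where crawlid <= N" approach. TRUNCATE is DDL that drops and recreates the table, so it completes in near-constant time and returns the disk space immediately, whereas a large DELETE rewrites rows and leaves fragmentation behind. A hypothetical helper (not in the codebase) capturing the shared pattern, assuming dbapi.inc's doSimpleCommand():

    <?php
    function truncateAndOptimize($table) {
      // TRUNCATE is effectively DROP + CREATE: fast, and space is freed at once.
      doSimpleCommand("TRUNCATE TABLE $table;");
      // OPTIMIZE then rebuilds the (now empty) table and refreshes its stats;
      // after a TRUNCATE it is cheap and mostly belt-and-braces.
      doSimpleCommand("optimize table $table;");
    }

The tradeoff is that TRUNCATE is all-or-nothing, which is why it only becomes safe once every crawl's requests are already archived in the per-crawl dump files.
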
"\n"; -cleanupRequests("California:Chrome", "requestsdev"); -cleanupRequests("California:Chrome", "requests"); -cleanupRequests("California2:Chrome.3G", "requestsmobiledev"); -cleanupRequests("California2:Chrome.3G", "requestsmobile"); - -echo "DONE\n\n"; - -function cleanupRequests($location, $table) { - global $gSkipRuns, $gbActuallyDoit; +$nUnfinished = doSimpleQuery("select count(*) from crawls where finishedDateTime is null;"); +if ( 0 < $nUnfinished ) { + cprint("SORRY! There is an unfinished crawl. Skipping the cleanup while the crawl is running."); + exit(1); +} - $query = "select * from crawls where location = '$location' and finishedDateTime is not null order by crawlid desc limit " . ($gSkipRuns+1) . ";"; - $results = doQuery($query); - mysqli_data_seek($results, $gSkipRuns); - $row = mysqli_fetch_assoc($results); +cleanupRequests("requestsdev"); +cleanupRequests("requests"); +cleanupRequests("requestsmobiledev"); +cleanupRequests("requestsmobile"); - if ( $gbActuallyDoit ) { - $nUnfinished = doSimpleQuery("select count(*) from crawls where location = '$location' and finishedDateTime is null;"); - if ( 0 < $nUnfinished ) { - cprint("SORRY! There is an unfinished crawl for location '$location'. Skipping the cleanup while the crawl is running."); - return; - } +echo "DONE\n\n"; - // Actually delete rows and optimize the table. - cprint("Delete requests from \"$table\" table starting with crawl \"{$row['label']}\" crawlid={$row['crawlid']} minPageid={$row['minPageid']} maxPageid={$row['maxPageid']} and earlier..."); - $cmd = "delete from $table where crawlid <= {$row['crawlid']};"; - cprint("$cmd"); - doSimpleCommand($cmd); - cprint("Optimize table \"$table\"..."); - doSimpleCommand("optimize table $table;"); - cprint("Done with table \"$table\"."); - } - else { - // How many rows would be deleted? - $numRows = doSimpleQuery("select count(*) from $table where crawlid <= {$row['crawlid']};"); - cprint("$numRows rows to be deleted for $location in $table."); +function cleanupRequests($table) { + global $lastCrawl; - cprint("WOULD delete requests from \"$table\" table starting with crawl \"{$row['label']}\" crawlid={$row['crawlid']} minPageid={$row['minPageid']} maxPageid={$row['maxPageid']} and earlier..."); - } + // Actually delete rows and optimize the table. + $cmd = "TRUNCATE TABLE $table;"; + cprint("$cmd"); + doSimpleCommand($cmd); + cprint("Optimize table \"$table\"..."); + doSimpleCommand("optimize table $table;"); + cprint("Done with table \"$table\"."); echo exec("df -h .") . "\n"; } diff --git a/bulktest/importurls.php b/bulktest/importurls.php index 282450a4..be92a823 100644 --- a/bulktest/importurls.php +++ b/bulktest/importurls.php @@ -50,6 +50,10 @@ if ( "alexa" === $gFileType ) { doSimpleCommand("update $gUrlsTable set ranktmp=null;"); } +// Clear out existing CrUX URLs. +else if ( "other" === $gFileType ) { + doSimpleCommand("truncate table $gUrlsTable;"); +} $handle = @fopen($gUrlsFile, "r"); diff --git a/crawls.inc b/crawls.inc index 6a217a2e..cbe689c0 100644 --- a/crawls.inc +++ b/crawls.inc @@ -271,7 +271,7 @@ function dumpCrawl2($label, $archive=null, $location=null, $bMysql=true, $bCsv=t $dumpfile = dumpfileName2($tablename, "csv") . ".gz"; tprint("Creating dump file $dumpfile..."); $dumpfileSql = dumpfileName2($tablename, "sql"); - $tmpdir = "/tmp/$tablename." . time(); // Unique dir for this dump cuz mysqldump writes files that aren't writable by this process, and mysqldump -T can NOT overwrite existing files. + $tmpdir = "/var/tmp/$tablename." . 
diff --git a/crawls.inc b/crawls.inc
index 6a217a2e..cbe689c0 100644
--- a/crawls.inc
+++ b/crawls.inc
@@ -271,7 +271,7 @@ function dumpCrawl2($label, $archive=null, $location=null, $bMysql=true, $bCsv=t
     $dumpfile = dumpfileName2($tablename, "csv") . ".gz";
     tprint("Creating dump file $dumpfile...");
     $dumpfileSql = dumpfileName2($tablename, "sql");
-    $tmpdir = "/tmp/$tablename." . time(); // Unique dir for this dump cuz mysqldump writes files that aren't writable by this process, and mysqldump -T can NOT overwrite existing files.
+    $tmpdir = "/var/tmp/$tablename." . time(); // Unique dir for this dump cuz mysqldump writes files that aren't writable by this process, and mysqldump -T can NOT overwrite existing files.
     exec("mkdir $tmpdir; chmod 777 $tmpdir;");
     $cmd = "mysqldump --opt --complete-insert --skip-add-drop-table -u $gMysqlUsername -p$gMysqlPassword -h $gMysqlServer -T $tmpdir $gMysqlDb $tablename; " .
       "gzip -f -c $tmpdir/$tablename.txt > $dumpfile ; cp $tmpdir/$tablename.sql $dumpfileSql";
@@ -306,7 +306,7 @@ function dumpCrawl($label, $archive=null, $location=null) {
   // pages csv
   // Unique dir for this dump cuz mysqldump writes files that aren't writable by this process, and mysqldump -T can NOT overwrite existing files.
   $labelUnderscore = str_replace(" ", "_", $label);
-  $tmpdir = "/tmp/$labelUnderscore." . time();
+  $tmpdir = "/var/tmp/$labelUnderscore." . time();
   $cmd = "mkdir $tmpdir; chmod 777 $tmpdir;";
   exec($cmd);
   $dumpfile = dumpfileName($label, "pages", "csv");
diff --git a/dbapi.inc b/dbapi.inc
index 03ed4b7b..70a94377 100644
--- a/dbapi.inc
+++ b/dbapi.inc
@@ -39,7 +39,7 @@ $gUrlsChangeTableDesktop = $gUrlsChangeTable;

 // Mobile tables
 $gPagesTableMobile = $gPagesTable . "mobile";
 $gRequestsTableMobile = $gRequestsTable . "mobile";
-$gUrlsTableMobile = "urls";
+$gUrlsTableMobile = "urlsmobile";
 $gStatusTableMobile = $gStatusTable . "mobile";
 $gStatsTableMobile = $gStatsTable; // share the data table - a first step toward a single DB
@@ -98,6 +98,7 @@ else if ( $gbMobile ) {
   $gRequestsTable = $gRequestsTableMobile;
   $gStatusTable = $gStatusTableMobile;
   $gStatsTable = $gStatsTableMobile;
+  $gUrlsTable = $gUrlsTableMobile;
 }
 else if ( $gbChrome ) {
   // Use a chrome version of the database tables if "chrome" is in the path.
diff --git a/settings.inc b/settings.inc
index 455d68e0..03ef3cbf 100644
--- a/settings.inc
+++ b/settings.inc
@@ -36,6 +36,8 @@
 $gHAUrl = "http://httparchive.org/";
 $gHAMUrl = "http://mobile.httparchive.org/";
 $gWPTUrl = "//httparchive.webpagetest.org/";

+$gMaxQueueLength = 60000;
+
 $gbPrivateInstance = false;
 ?>
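Finally, the dbapi.inc change fixes a table-sharing bug: $gUrlsTableMobile previously pointed at the desktop "urls" table and the mobile branch never reassigned $gUrlsTable, so mobile runs read (and, with the new importurls.php truncate, would have wiped) the desktop URL list. A condensed sketch of the corrected selection, with values inlined for illustration:

    <?php
    $gUrlsTable = "urls";              // desktop default
    $gUrlsTableMobile = "urlsmobile";  // was "urls" before this patch
    $gbMobile = true;                  // set when "mobile" is in the path
    if ( $gbMobile ) {
      // The added assignment: mobile runs now get their own URL table
      // instead of sharing (and potentially clobbering) the desktop one.
      $gUrlsTable = $gUrlsTableMobile;
    }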