Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Systematically survey message content for unimplemented features. #917

Merged
merged 2 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion pubspec.lock
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ packages:
source: hosted
version: "5.1.1"
args:
dependency: transitive
dependency: "direct dev"
description:
name: args
sha256: "7cf60b9f0cc88203c5a190b4cd62a99feea42759a7fa695010eb5de1c0b2252a"
Expand Down Expand Up @@ -622,6 +622,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "0.2.1+1"
ini:
dependency: "direct dev"
Comment on lines +625 to +626
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The commit message understates this change:

deps: Make args and ini direct dev dependencies.

For args we're just converting it from a transitive dependency to a direct (dev) dependency.

But for ini the change is bigger: it wasn't a dependency at all, and now it is one.

So the commit message should describe the most important change. Then the args part is minor, and is fine to squash into the same commit and just mention in passing.

description:
name: ini
sha256: "12a76c53591ffdf86d1265be3f986888a6dfeb34a85957774bc65912d989a173"
url: "https://pub.dev"
source: hosted
version: "2.1.0"
integration_test:
dependency: "direct dev"
description: flutter
Expand Down
2 changes: 2 additions & 0 deletions pubspec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,15 @@ dev_dependencies:
integration_test:
sdk: flutter

args: ^2.5.0
build_runner: ^2.3.3
checks: ^0.3.0
clock: ^1.1.1
drift_dev: ^2.5.2
fake_async: ^1.3.1
flutter_checks: ^0.1.1
flutter_lints: ^4.0.0
ini: ^2.1.0
json_serializable: ^6.5.4
legacy_checks: ^0.1.0
pigeon: ^20.0.1
Expand Down
108 changes: 108 additions & 0 deletions tools/content/check-features
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env bash
set -euo pipefail

default_steps=(fetch check)

usage() {
cat <<EOF
usage: tools/content/check-features [OPTION]... [STEP]... <CORPUS_DIR>

Fetch messages from a Zulip server and check the content parser for
unimplemented features.

By default, run the following steps:
${default_steps[*]}

CORPUS_DIR is required. It is the directory to store or read corpus files.
This directory will be created if it does not exist already.

The steps are:

fetch Fetch the corpus needed from the server specified via the config
file into \`CORPUS_DIR\` incrementally. This step can take a long
time on servers with a lot of public messages when starting from
scratch.
This wraps around tools/content/fetch_messages.dart.

check Check for unimplemented content parser features. This requires
the corpus directory \`CORPUS_DIR\` to contain at least one corpus
file.
This wraps around tools/content/unimplemented_features_test.dart.

Options:

--config <FILE>
A zuliprc file with identity information including email, API key
and the Zulip server URL to fetch the messages from.
Mandatory if running step \`fetch\`. To get the file, see
https://zulip.com/api/configuring-python-bindings#download-a-zuliprc-file.

--verbose Print more details about everything, especially when checking for
unsupported features.

--help Show this help message.
EOF
}

opt_corpus_dir=
opt_zuliprc=
opt_verbose=
opt_steps=()
while (( $# )); do
case "$1" in
fetch|check) opt_steps+=("$1"); shift;;
--config) shift; opt_zuliprc="$1"; shift;;
--verbose) opt_verbose=1; shift;;
--help) usage; exit 0;;
*)
if [ -n "$opt_corpus_dir" ]; then
# Forbid passing multiple corpus directories.
usage >&2; exit 2
fi
opt_corpus_dir="$1"; shift;;
esac
done

if [ -z "$opt_corpus_dir" ]; then
echo >&2 "Error: Positional argument CORPUS_DIR is required."
echo >&2
usage >&2; exit 2
fi

if (( ! "${#opt_steps[@]}" )); then
opt_steps=( "${default_steps[@]}" )
fi

run_fetch() {
if [ -z "$opt_zuliprc" ]; then
echo >&2 "Error: Option \`--config\` is required for step \`fetch\`."
echo >&2
usage >&2; exit 2
fi

if [ -n "$opt_verbose" ]; then
echo "Fetching all public messages using API config \"$opt_zuliprc\"." \
" This can take a long time."
fi
# This may have a side effect of creating or modifying the corpus
# file named after the Zulip server's host name.
dart tools/content/fetch_messages.dart --config-file "$opt_zuliprc" \
--corpus-dir "$opt_corpus_dir" \
|| return 1
}

run_check() {
flutter test tools/content/unimplemented_features_test.dart \
--dart-define=corpusDir="$opt_corpus_dir" \
--dart-define=verbose="$opt_verbose" \
|| return 1
}

for step in "${opt_steps[@]}"; do
echo "Running ${step}"
case "${step}" in
fetch) run_fetch ;;
check) run_check ;;
*) echo >&2 "Internal error: unknown step ${step}" ;;
esac
done
233 changes: 233 additions & 0 deletions tools/content/fetch_messages.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
import 'dart:convert';
import 'dart:io';
import 'dart:math';

// Avoid any Flutter-related dependencies so this can be run as a CLI program.
import 'package:args/args.dart';
import 'package:http/http.dart' as http;
import 'package:ini/ini.dart' as ini;
import 'package:zulip/api/backoff.dart';

import 'model.dart';

/// Fetch all public message contents from a Zulip server in bulk.
///
/// It outputs JSON entries of the message IDs and the rendered HTML contents in
/// JSON Lines (https://jsonlines.org) format. The output can be used later to
/// perform checks for discovering unimplemented features.
///
/// Because message IDs are only unique within a single server, the script
/// names corpora by the server host names.
///
/// This script is meant to be run via `tools/content/check-features`.
///
/// For more help, run `tools/content/check-features --help`.
///
/// See also:
/// * tools/content/unimplemented_features_test.dart, which runs checks against
/// the fetched corpora.
void main(List<String> args) async {
final argParser = ArgParser();
argParser.addOption(
'config-file',
help: 'A zuliprc file with identity information including email, API key\n'
'and the Zulip server URL to fetch the messages from (required).\n\n'
'To get the file, see\n'
'https://zulip.com/api/configuring-python-bindings#download-a-zuliprc-file.',
valueHelp: 'path/to/zuliprc',
);
argParser.addOption(
'corpus-dir',
help: 'The directory to look for/store the corpus file (required).\n'
'The script will first read from the existing corpus file\n'
'(assumed to be named as "your-zulip-server.com.jsonl")\n'
'to avoid duplicates before fetching more messages.',
valueHelp: 'path/to/corpus-dir',
);
argParser.addFlag(
'fetch-newer',
help: 'Fetch newer messages instead of older ones.\n'
'Only useful when there is a matching corpus file in corpus-dir.',
defaultsTo: false,
);
argParser.addFlag(
'help', abbr: 'h',
negatable: false,
help: 'Show this help message.',
);

void printUsage() {
// Give it a pass when printing the help message.
// ignore: avoid_print
print('usage: fetch_messages --config-file <CONFIG_FILE>\n\n'
'Fetch message contents from a Zulip server in bulk.\n\n'
'${argParser.usage}');
}

Never throwWithUsage(String error) {
printUsage();
throw Exception('\nError: $error');
}

final parsedArguments = argParser.parse(args);
if (parsedArguments['help'] as bool) {
printUsage();
exit(0);
}

final zuliprc = parsedArguments['config-file'] as String?;
if (zuliprc == null) {
throwWithUsage('"config-file is required');
}

final configFile = File(zuliprc);
if (!configFile.existsSync()) {
throwWithUsage('Config file "$zuliprc" does not exist');
}

// `zuliprc` is a file in INI format containing the user's identity
// information.
//
// See also:
// https://zulip.com/api/configuring-python-bindings#configuration-keys-and-environment-variables
final parsedConfig = ini.Config.fromString(configFile.readAsStringSync());
await fetchMessages(
email: parsedConfig.get('api', 'email') as String,
apiKey: parsedConfig.get('api', 'key') as String,
site: Uri.parse(parsedConfig.get('api', 'site') as String),
outputDirStr: parsedArguments['corpus-dir'] as String,
fetchNewer: parsedArguments['fetch-newer'] as bool,
);
}

Future<void> fetchMessages({
required String email,
required String apiKey,
required Uri site,
required String outputDirStr,
required bool fetchNewer,
}) async {
int? anchorMessageId;
final outputDir = Directory(outputDirStr);
outputDir.createSync(recursive: true);
final outputFile = File('$outputDirStr/${site.host}.jsonl');
if (!outputFile.existsSync()) outputFile.createSync();
// Look for the known newest/oldest message so that we can continue
// fetching from where we left off.
await for (final message in readMessagesFromJsonl(outputFile)) {
anchorMessageId ??= message.id;
// Newer Zulip messages have higher message IDs.
anchorMessageId = (fetchNewer ? max : min)(message.id, anchorMessageId);
}
final output = outputFile.openWrite(mode: FileMode.writeOnlyAppend);

final client = http.Client();
final authHeader = 'Basic ${base64Encode(utf8.encode('$email:$apiKey'))}';

// These are working constants chosen arbitrarily.
const batchSize = 5000;
const maxRetries = 10;
const fetchInterval = Duration(seconds: 5);

int retries = 0;
BackoffMachine? backoff;

while (true) {
// This loops until there is no message fetched in an iteration.
final _GetMessagesResult result;
try {
// This is the one place where some output would be helpful,
// for indicating progress.
// ignore: avoid_print
print('Fetching $batchSize messages starting from message ID $anchorMessageId');
result = await _getMessages(client, realmUrl: site,
authHeader: authHeader,
anchorString: anchorMessageId != null ? jsonEncode(anchorMessageId)
: fetchNewer ? 'oldest' : 'newest',
// When the anchor message does not exist in the corpus,
// we should include it.
includeAnchor: anchorMessageId == null,
numBefore: (!fetchNewer) ? batchSize : 0,
numAfter: (fetchNewer) ? batchSize : 0,
);
} catch (e) {
// We could have more fine-grained error handling and avoid retrying on
// non-network-related failures, but that's not necessary.
if (retries >= maxRetries) {
rethrow;
}
retries++;
await (backoff ??= BackoffMachine()).wait();
continue;
}

final messageEntries = result.messages.map(MessageEntry.fromJson);
if (messageEntries.isEmpty) {
// Sanity check to ensure that the server agrees
// there is no more messages to fetch.
if (fetchNewer) assert(result.foundNewest);
if (!fetchNewer) assert(result.foundOldest);
break;
}

// Find and use the newest/oldest message as the next message fetch anchor.
anchorMessageId = messageEntries.map((x) => x.id).reduce(fetchNewer ? max : min);
messageEntries.map(jsonEncode).forEach((json) => output.writeln(json));

// This I/O operation could fail, but crashing is fine here.
final flushFuture = output.flush();
// Make sure the delay happens concurrently to the flush.
await Future<void>.delayed(fetchInterval);
await flushFuture;
backoff = null;
}
}

/// https://zulip.com/api/get-messages#response
// Partially ported from [GetMessagesResult] to avoid depending on Flutter libraries.
class _GetMessagesResult {
const _GetMessagesResult(this.foundOldest, this.foundNewest, this.messages);

final bool foundOldest;
final bool foundNewest;
final List<Map<String, Object?>> messages;

factory _GetMessagesResult.fromJson(Map<String, Object?> json) =>
_GetMessagesResult(
json['found_oldest'] as bool,
json['found_newest'] as bool,
(json['messages'] as List<Object?>).map((x) => (x as Map<String, Object?>)).toList());
}

/// https://zulip.com/api/get-messages
Future<_GetMessagesResult> _getMessages(http.Client client, {
required Uri realmUrl,
required String authHeader,
required String anchorString,
required bool includeAnchor,
required int numBefore,
required int numAfter,
}) async {
final url = realmUrl.replace(
path: '/api/v1/messages',
queryParameters: {
// This fallback will only be used when first fetching from a server.
'anchor': anchorString,
'include_anchor': jsonEncode(includeAnchor),
'num_before': jsonEncode(numBefore),
'num_after': jsonEncode(numAfter),
'narrow': jsonEncode([{'operator': 'channels', 'operand': 'public'}]),
});
final response = await client.send(
http.Request('GET', url)..headers['Authorization'] = authHeader);
final bytes = await response.stream.toBytes();
final json = jsonDecode(utf8.decode(bytes)) as Map<String, dynamic>?;

if (response.statusCode != 200 || json == null) {
// Just crashing early here should be fine for this tool. We don't need
// to handle the specific error codes.
throw Exception('Failed to get messages. Code: ${response.statusCode}\n'
'Details: ${json ?? 'unknown'}');
}
return _GetMessagesResult.fromJson(json);
}
Loading