Skip to content

Commit

Permalink
tools/content: Support systematically surveying unimplemented content…
Browse files Browse the repository at this point in the history
… features.

We added 2 scripts.

- fetch_messages.dart, the script that fetches messages from a given
  Zulip server, that does not depend on Flutter or other involved Zulip
  Flutter packages, so that it can run without Flutter.  It is meant to
  be run first to produce the corpuses needed for surveying the
  unimplemented features.

  The fetched messages are formatted in JSON Lines format, where each
  individual entry is JSON containing the message ID and the rendered
  HTML content.  The user is encouraged to have a separate file for
  messages from each server, because message IDs are not unique across
  them.

- unimplemented_features_test.dart, a test that goes over all messages
  collected, parses then with the content parser, and report the
  unimplemented features it discovered.  This is implemented as a test
  mainly because of its dependency on the content parser, which depends
  on Flutter.  It has be run manually via:
  `flutter test --dart-define=corpusDir=path/to/corpusDir tools/content`
  See comments from the file for more instructions.

Fixes: #190

Signed-off-by: Zixuan James Li <[email protected]>
  • Loading branch information
PIG208 committed Aug 29, 2024
1 parent 7b24f6d commit b0d8d5a
Show file tree
Hide file tree
Showing 3 changed files with 389 additions and 0 deletions.
222 changes: 222 additions & 0 deletions tools/content/fetch_messages.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
#!/usr/bin/env dart

import 'dart:convert';
import 'dart:io';
import 'dart:math';

// Avoid any Flutter-related dependencies so this can be run as a CLI program.
import 'package:args/args.dart';
import 'package:http/http.dart';
import 'package:zulip/api/backoff.dart';

import 'model.dart';

/// Fetch message contents from the specified Zulip server in bulk.
///
/// It outputs JSON entries of the message IDs and the rendered HTML contents in
/// JSON Lines (https://jsonlines.org) format. The output can be used later to
/// perform checks for discovering unimplemented features.
///
/// Because message IDs are only unique within a single server, it is encouraged
/// to store corpuses from each server separately to avoid confusion when
/// identifying messages.
///
/// See tools/content/unimplemented_features_test.dart for more details.
void main(List<String> args) async {
final argParser = ArgParser();
argParser.addOption(
'email',
help: 'The email. See https://zulip.com/api/api-keys for help.',
mandatory: true,
);
argParser.addOption(
'api-key',
help: 'The API key. See https://zulip.com/api/api-keys for help.',
mandatory: true,
);
argParser.addOption(
'site',
help: 'The URL of the Zulip server to fetch messages from.',
valueHelp: 'https://example.zulip.com',
mandatory: true,
);
argParser.addOption(
'file',
help: 'The file to output the messages to. If not given, write output to\n'
'stdout. Otherwise, if the file exists, its format should match the\n'
'output of the program. This will first read from the file to avoid\n'
'duplicates, by fetching messages starting from the newest/oldest\n'
'known message, then append the output to the end of the file.',
valueHelp: 'path/to/czo.jsonl',
);
argParser.addOption(
'count',
defaultsTo: '100',
help: 'The total number of messages to fetch.',
);
argParser.addFlag(
'fetch-newer',
help: 'Fetch newer messages instead of older ones.\n'
'Only useful when --file is supplied.',
defaultsTo: false,
);
argParser.addFlag(
'help', abbr: 'h',
negatable: false,
help: 'Show this help message.',
);

void printUsage() {
// Give it a pass when printing the help message.
// ignore: avoid_print
print('usage: fetch_messages --email <EMAIL> --api-key <API_KEY> --site <SERVER_URL>\n\n'
'Fetch message contents from the specified Zulip server in bulk.\n\n'
'${argParser.usage}');
}

Never throwWithUsage(String error) {
printUsage();
throw Exception('\nError: $error');
}

final parsedArguments = argParser.parse(args);
if (parsedArguments['help'] as bool) {
printUsage();
exit(0);
}

final email = parsedArguments['email'] as String?;
if (email == null) throwWithUsage('Option email is required');

final apiKey = parsedArguments['api-key'] as String?;
if (apiKey == null) throwWithUsage('Option api-key is required');

final realmUrlStr = parsedArguments['site'] as String?;
if (realmUrlStr == null) throwWithUsage('Option site is required');
final realmUrl = Uri.parse(realmUrlStr);

final count = int.parse(parsedArguments['count'] as String);

final outputPath = parsedArguments['file'] as String?;
final fetchNewer = parsedArguments['fetch-newer'] as bool;
int? anchorMessageId;
IOSink output = stdout;
if (outputPath != null) {
final outputFile = File(outputPath);
if (!outputFile.existsSync()) {
outputFile.createSync();
}
await for (final message in readMessagesFromJsonl(outputFile)) {
// Find the newest/oldest message ID as the anchor.
anchorMessageId ??= message.id;
anchorMessageId = (fetchNewer ? max : min)(message.id, anchorMessageId);
}
output = outputFile.openWrite(mode: FileMode.writeOnlyAppend);
}

final client = Client();
final authHeader = 'Basic ${base64Encode(utf8.encode('$email:$apiKey'))}';

// These are working constants chosen abitrarily.
const batchSize = 5000;
const maxRetries = 10;
const fetchInterval = Duration(seconds: 5);

int retries = 0;
int messageToFetch = count;
BackoffMachine? backoff;

while (messageToFetch > 0) {
// Fetch messages in batches from newer messages to older messages by
// default, until there aren't any more messages to be fetched.
// Note that newer Zulip messages have higher IDs.
final currentBatchSize = (batchSize < messageToFetch) ? batchSize : messageToFetch;
final _GetMessagesResult result;
try {
result = await _getMessages(client, realmUrl: realmUrl,
authHeader: authHeader,
anchorMessageId: anchorMessageId,
numBefore: (!fetchNewer) ? currentBatchSize : 0,
numAfter: (fetchNewer) ? currentBatchSize : 0,
);
} catch (e) {
// We could have more fine-grained error handling and avoid retrying on
// non-network-related failures, but that's skipped for now.
if (retries >= maxRetries) {
rethrow;
}
retries++;
await (backoff ??= BackoffMachine()).wait();
continue;
}

final messageEntries = result.messages.map(MessageEntry.fromRawMessage);
if (messageEntries.isEmpty) {
if (fetchNewer) assert(result.foundNewest);
if (!fetchNewer) assert(result.foundOldest);
break;
}

// Find and use the newest/oldest message as the next message fetch anchor.
anchorMessageId = messageEntries.map((x) => x.id).reduce(fetchNewer ? max : min);
messageEntries.map(jsonEncode).forEach((json) => output.writeln(json));
messageToFetch -= messageEntries.length;

// This I/O operation could fail, but crashing is fine here.
final flushFuture = output.flush();
// Make sure the delay happens concurrently to the flush.
if (messageToFetch > 0) await Future<void>.delayed(fetchInterval);
await flushFuture;
backoff = null;
}
exit(0);
}

/// https://zulip.com/api/get-messages#response
// Ported from [GetMessagesResult] to avoid depending on Flutter libraries.
class _GetMessagesResult {
const _GetMessagesResult(this.foundOldest, this.foundNewest, this.messages);

final bool foundOldest;
final bool foundNewest;
final List<Map<String, Object?>> messages;

factory _GetMessagesResult.fromJson(Map<String, Object?> json) =>
_GetMessagesResult(
json['found_oldest'] as bool,
json['found_newest'] as bool,
(json['messages'] as List<Object?>).map((x) => (x as Map<String, Object?>)).toList());
}

Future<_GetMessagesResult> _getMessages(Client client, {
required Uri realmUrl,
required String authHeader,
required int numBefore,
required int numAfter,
int? anchorMessageId,
}) async {
final url = realmUrl.replace(
path: '/api/v1/messages',
queryParameters: {
// This fallback will be used when there is no file given,
// and there is no known messages.
'anchor': anchorMessageId != null ? jsonEncode(anchorMessageId) : 'newest',
// A known anchor message already exists in the output,
// so avoid fetching it again.
'include_anchor': jsonEncode(anchorMessageId == null),
'num_before': jsonEncode(numBefore),
'num_after': jsonEncode(numAfter),
'narrow': jsonEncode([{'operator': 'channels', 'operand': 'public'}]),
});
final response = await client.send(
Request('GET', url)..headers['Authorization'] = authHeader);
final bytes = await response.stream.toBytes();
final json = jsonDecode(utf8.decode(bytes)) as Map<String, dynamic>?;

if (response.statusCode != 200 || json == null) {
// We could handle rate limiting or other error codes, but just crashing
// early here should be fine for this tool.
throw Exception('Failed to get messages. Code: ${response.statusCode}\nDetails: ${json ?? 'unknown'}');
}
return _GetMessagesResult.fromJson(json);
}
38 changes: 38 additions & 0 deletions tools/content/model.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import 'dart:io';
import 'dart:convert';

import 'package:json_annotation/json_annotation.dart';

/// A data structure representing a message.
@JsonSerializable()
final class MessageEntry {
const MessageEntry({
required this.id,
required this.html,
});

/// Selectively parses from get-message responses.
///
/// See also: https://zulip.com/api/get-messages#response
factory MessageEntry.fromRawMessage(Map<String, Object?> json) =>
MessageEntry(id: (json['id'] as num).toInt(), html: json['content'] as String);

factory MessageEntry.fromJson(Map<String, Object?> json) =>
MessageEntry(id: (json['id'] as num).toInt(), html: json['html'] as String);

Map<String, Object> toJson() => {'id': id, 'html': html};

/// The message ID, unique within a server.
final int id;

/// The rendered HTML of the message.
final String html;
}

/// Open the given JSON Lines file and read [MessageEntry] from it.
///
/// We store the entries in JSON Lines format and return them from a stream to
/// avoid excessive use of memory.
Stream<MessageEntry> readMessagesFromJsonl(File file) => file.openRead()
.transform(utf8.decoder).transform(const LineSplitter())
.map(jsonDecode).map((x) => MessageEntry.fromJson(x as Map<String, Object?>));
129 changes: 129 additions & 0 deletions tools/content/unimplemented_features_test.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
@Timeout(Duration(minutes: 10))
library;

import 'dart:io';
import 'dart:math';

import 'package:checks/checks.dart';
import 'package:html/dom.dart' as dom;
import 'package:flutter/foundation.dart';
import 'package:flutter_test/flutter_test.dart';
import 'package:zulip/model/content.dart';

import 'model.dart';


/// Check if there are unimplemented features from the given corpuses of HTML
/// contents from Zulip messages.
///
/// This test is meant to be manually run.
///
/// To run it, use:
///
/// flutter test tools/content --dart-define=corpusDir=path/to/corpusDir
///
/// where `path/to/corpusDir` should be a directory containing files with
/// outputs generated from tools/content/fetch_messages.dart.
///
/// Optionally, you can enable more details with `--dart-define=verbose=true`.
///
/// The test writes an overview of unimplemented features at the beginning to
/// standard output, followed by the details of each feature. To look for live
/// examples, you can search on the Zulip community by message ID from all
/// public channels.
///
/// For example, a search query like "near: 12345 channels: public" would work.
///
/// See also:
/// * lib/model/content.dart, the implementation of the content parser.
/// * tools/content/fetch_messages.dart, the script that produces the corpuses.
void main() async {
Future<void> checkForUnimplementedFeatureInFile(File file) async {
final messageIdsByFeature = <String, Set<int>>{};
final contentsByFeature = <String, List<String>>{};

await for (final message in readMessagesFromJsonl(file)) {
_walk(message.id, parseContent(message.html).toDiagnosticsNode(),
messageIdsByFeature: messageIdsByFeature,
contentsByFeature: contentsByFeature);
}

// This buffer allows us to avoid using prints directly.
final outputLines = <String>[];
if (messageIdsByFeature.isNotEmpty) outputLines.add('Found unimplemented features:');
for (final featureName in messageIdsByFeature.keys) {
Set<int> messageIds = messageIdsByFeature[featureName]!;
int oldestId = messageIds.reduce(min);
int newestId = messageIds.reduce(max);
outputLines.add('- `$featureName`\n Oldest message: $oldestId; newest message: $newestId\n');
}
outputLines.add('');

final divider = '\n\n${'=' * 80}\n\n';
int unsupportedCounter = 0;
for (final MapEntry(key: featureName, value: messageContents) in contentsByFeature.entries) {
unsupportedCounter++;
if (!_verbose) continue;
outputLines.addAll([
'Unsupported feature #$unsupportedCounter: $featureName',
'message IDs:\n${messageIdsByFeature[featureName]!.join(', ')}',
'first 10 examples:\n${messageContents.take(10).join(divider)}',
'\n',
]);
}
check(unsupportedCounter, because: outputLines.join('\n')).equals(0);
}

final corpusFiles = _getCorpusFiles();
group('Check for unimplemented features in', () {
for (final file in corpusFiles) {
test(file.path, () => checkForUnimplementedFeatureInFile(file));
}
}, skip: corpusFiles.isEmpty);
}

// Determine whether details about all messages with unimplemented features
// should be printed.
const bool _verbose = bool.fromEnvironment('verbose');

const String _corpusDirPath = String.fromEnvironment('corpusDir');

Iterable<File> _getCorpusFiles() {
final corpusDir = Directory(_corpusDirPath);
return corpusDir.existsSync() ? corpusDir.listSync().whereType<File>() : [];
}

/// Walk the tree looking for unimplemented nodes, and aggregate them by the
/// category of the unimplemented feature.
///
/// This modifies `messageIdsByFeature` and `contentsByFeature` in-place.
void _walk(int messageId, DiagnosticsNode node, {
required Map<String, Set<int>> messageIdsByFeature,
required Map<String, List<String>> contentsByFeature,
}) {
final value = node.value;
if (value is! UnimplementedNode) {
for (final child in node.getChildren()) {
_walk(messageId, child,
messageIdsByFeature: messageIdsByFeature,
contentsByFeature: contentsByFeature);
}
return;
}

// `featureName` is a prettified identifier used for categorizing
// unimplemented features that are likely closely related.
final String featureName;
final htmlNode = value.debugHtmlNode;
if (htmlNode is dom.Element) {
if (htmlNode.className.isEmpty) {
featureName = '<${htmlNode.localName!}>';
} else {
featureName = '<${htmlNode.localName!} class="${htmlNode.classes.join(" ")}">';
}
} else {
featureName = 'DOM node type: ${htmlNode.nodeType}';
}
(messageIdsByFeature[featureName] ??= {}).add(messageId);
(contentsByFeature[featureName] ??= []).add(value.debugHtmlText);
}

0 comments on commit b0d8d5a

Please sign in to comment.