-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
tools/content: Support systematically surveying unimplemented content…
… features. We added 2 scripts. - fetch_messages.dart, the script that fetches messages from a given Zulip server, that does not depend on Flutter or other involved Zulip Flutter packages, so that it can run without Flutter. It is meant to be run first to produce the corpuses needed for surveying the unimplemented features. The fetched messages are formatted in JSON Lines format, where each individual entry is JSON containing the message ID and the rendered HTML content. The user is encouraged to have a separate file for messages from each server, because message IDs are not unique across them. - unimplemented_features_test.dart, a test that goes over all messages collected, parses them with the content parser, and reports the unimplemented features it discovered. This is implemented as a test mainly because of its dependency on the content parser, which depends on Flutter. It has to be run manually via: `flutter test --dart-define=corpusDir=path/to/corpusDir tools/content` See comments from the file for more instructions. Fixes: #190 Signed-off-by: Zixuan James Li <[email protected]>
- Loading branch information
Showing
3 changed files
with
389 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,222 @@ | ||
#!/usr/bin/env dart | ||
|
||
import 'dart:convert'; | ||
import 'dart:io'; | ||
import 'dart:math'; | ||
|
||
// Avoid any Flutter-related dependencies so this can be run as a CLI program. | ||
import 'package:args/args.dart'; | ||
import 'package:http/http.dart'; | ||
import 'package:zulip/api/backoff.dart'; | ||
|
||
import 'model.dart'; | ||
|
||
/// Fetch message contents from the specified Zulip server in bulk.
///
/// It outputs JSON entries of the message IDs and the rendered HTML contents in
/// JSON Lines (https://jsonlines.org) format. The output can be used later to
/// perform checks for discovering unimplemented features.
///
/// Because message IDs are only unique within a single server, it is encouraged
/// to store corpuses from each server separately to avoid confusion when
/// identifying messages.
///
/// When `--file` is given and the file already has entries, fetching resumes
/// from the oldest known message (or the newest one with `--fetch-newer`), and
/// new entries are appended to that file.
///
/// See tools/content/unimplemented_features_test.dart for more details.
void main(List<String> args) async {
  final argParser = ArgParser();
  argParser.addOption(
    'email',
    help: 'The email. See https://zulip.com/api/api-keys for help.',
    mandatory: true,
  );
  argParser.addOption(
    'api-key',
    help: 'The API key. See https://zulip.com/api/api-keys for help.',
    mandatory: true,
  );
  argParser.addOption(
    'site',
    help: 'The URL of the Zulip server to fetch messages from.',
    valueHelp: 'https://example.zulip.com',
    mandatory: true,
  );
  argParser.addOption(
    'file',
    help: 'The file to output the messages to. If not given, write output to\n'
      'stdout. Otherwise, if the file exists, its format should match the\n'
      'output of the program. This will first read from the file to avoid\n'
      'duplicates, by fetching messages starting from the newest/oldest\n'
      'known message, then append the output to the end of the file.',
    valueHelp: 'path/to/czo.jsonl',
  );
  argParser.addOption(
    'count',
    defaultsTo: '100',
    help: 'The total number of messages to fetch.',
  );
  argParser.addFlag(
    'fetch-newer',
    help: 'Fetch newer messages instead of older ones.\n'
      'Only useful when --file is supplied.',
    defaultsTo: false,
  );
  argParser.addFlag(
    'help', abbr: 'h',
    negatable: false,
    help: 'Show this help message.',
  );

  void printUsage() {
    // Give it a pass when printing the help message.
    // ignore: avoid_print
    print('usage: fetch_messages --email <EMAIL> --api-key <API_KEY> --site <SERVER_URL>\n\n'
      'Fetch message contents from the specified Zulip server in bulk.\n\n'
      '${argParser.usage}');
  }

  // Print the usage text, then abort with [error] as the failure reason.
  Never throwWithUsage(String error) {
    printUsage();
    throw Exception('\nError: $error');
  }

  final parsedArguments = argParser.parse(args);
  if (parsedArguments['help'] as bool) {
    printUsage();
    exit(0);
  }

  final email = parsedArguments['email'] as String?;
  if (email == null) throwWithUsage('Option email is required');

  final apiKey = parsedArguments['api-key'] as String?;
  if (apiKey == null) throwWithUsage('Option api-key is required');

  final realmUrlStr = parsedArguments['site'] as String?;
  if (realmUrlStr == null) throwWithUsage('Option site is required');
  final realmUrl = Uri.parse(realmUrlStr);

  // Report a malformed count the same way as the other argument errors,
  // instead of crashing with a bare FormatException.
  final countStr = parsedArguments['count'] as String;
  final count = int.tryParse(countStr);
  if (count == null) throwWithUsage('Option count must be an integer, got "$countStr"');

  final outputPath = parsedArguments['file'] as String?;
  final fetchNewer = parsedArguments['fetch-newer'] as bool;
  int? anchorMessageId;
  IOSink output = stdout;
  if (outputPath != null) {
    final outputFile = File(outputPath);
    if (!outputFile.existsSync()) {
      outputFile.createSync();
    }
    await for (final message in readMessagesFromJsonl(outputFile)) {
      // Find the newest/oldest message ID as the anchor.
      anchorMessageId ??= message.id;
      anchorMessageId = (fetchNewer ? max : min)(message.id, anchorMessageId);
    }
    output = outputFile.openWrite(mode: FileMode.writeOnlyAppend);
  }

  final client = Client();
  final authHeader = 'Basic ${base64Encode(utf8.encode('$email:$apiKey'))}';

  // These are working constants chosen arbitrarily.
  const batchSize = 5000;
  const maxRetries = 10;
  const fetchInterval = Duration(seconds: 5);

  // Counts failed requests across the whole run; unlike [backoff], it is
  // not reset after a successful batch.
  int retries = 0;
  int messageToFetch = count;
  BackoffMachine? backoff;

  while (messageToFetch > 0) {
    // Fetch messages in batches from newer messages to older messages by
    // default, until there aren't any more messages to be fetched.
    // Note that newer Zulip messages have higher IDs.
    final currentBatchSize = min(batchSize, messageToFetch);
    final _GetMessagesResult result;
    try {
      result = await _getMessages(client, realmUrl: realmUrl,
        authHeader: authHeader,
        anchorMessageId: anchorMessageId,
        numBefore: fetchNewer ? 0 : currentBatchSize,
        numAfter: fetchNewer ? currentBatchSize : 0,
      );
    } catch (e) {
      // We could have more fine-grained error handling and avoid retrying on
      // non-network-related failures, but that's skipped for now.
      if (retries >= maxRetries) {
        rethrow;
      }
      retries++;
      await (backoff ??= BackoffMachine()).wait();
      continue;
    }

    // Materialize the entries once; a lazy `map` would re-parse every raw
    // message on each of the traversals below.
    final messageEntries =
      result.messages.map(MessageEntry.fromRawMessage).toList();
    if (messageEntries.isEmpty) {
      assert(fetchNewer ? result.foundNewest : result.foundOldest);
      break;
    }

    // Find and use the newest/oldest message as the next message fetch anchor.
    anchorMessageId = messageEntries.map((x) => x.id).reduce(fetchNewer ? max : min);
    for (final entry in messageEntries) {
      output.writeln(jsonEncode(entry));
    }
    messageToFetch -= messageEntries.length;

    // This I/O operation could fail, but crashing is fine here.
    final flushFuture = output.flush();
    // Make sure the delay happens concurrently to the flush.
    if (messageToFetch > 0) await Future<void>.delayed(fetchInterval);
    await flushFuture;
    backoff = null;
  }
  exit(0);
}
|
||
/// The parts of a get-messages response that this tool reads.
///
/// See https://zulip.com/api/get-messages#response
// Ported from [GetMessagesResult] to avoid depending on Flutter libraries.
class _GetMessagesResult {
  const _GetMessagesResult(this.foundOldest, this.foundNewest, this.messages);

  /// Build a result from the decoded JSON object of a response.
  ///
  /// Reads the `found_oldest`, `found_newest` and `messages` keys, and
  /// throws if any of them does not have the expected type.
  factory _GetMessagesResult.fromJson(Map<String, Object?> json) {
    final foundOldest = json['found_oldest'] as bool;
    final foundNewest = json['found_newest'] as bool;
    final rawMessages = json['messages'] as List<Object?>;
    return _GetMessagesResult(foundOldest, foundNewest, [
      for (final raw in rawMessages) raw as Map<String, Object?>,
    ]);
  }

  /// The `found_oldest` field of the response.
  final bool foundOldest;

  /// The `found_newest` field of the response.
  final bool foundNewest;

  /// The raw, still-undecoded message objects of the response.
  final List<Map<String, Object?>> messages;
}
|
||
/// Fetch one batch of messages from the server at [realmUrl].
///
/// Sends a GET request to `/api/v1/messages`, narrowed to public channels,
/// with [authHeader] as the `Authorization` header. [numBefore] and
/// [numAfter] are passed through as `num_before` and `num_after`.
///
/// When [anchorMessageId] is null the anchor is `newest` and the anchor
/// message is included; otherwise the given message is the anchor and is
/// excluded from the result.
///
/// Throws an [Exception] carrying the HTTP status code when the status is
/// not 200 or when the response body is not a JSON object.
Future<_GetMessagesResult> _getMessages(Client client, {
  required Uri realmUrl,
  required String authHeader,
  required int numBefore,
  required int numAfter,
  int? anchorMessageId,
}) async {
  final url = realmUrl.replace(
    path: '/api/v1/messages',
    queryParameters: {
      // This fallback will be used when there is no file given,
      // and there is no known messages.
      'anchor': anchorMessageId != null ? jsonEncode(anchorMessageId) : 'newest',
      // A known anchor message already exists in the output,
      // so avoid fetching it again.
      'include_anchor': jsonEncode(anchorMessageId == null),
      'num_before': jsonEncode(numBefore),
      'num_after': jsonEncode(numAfter),
      'narrow': jsonEncode([{'operator': 'channels', 'operand': 'public'}]),
    });
  final response = await client.send(
    Request('GET', url)..headers['Authorization'] = authHeader);
  final bytes = await response.stream.toBytes();

  // Decode defensively: an error response may carry a body that is not
  // valid UTF-8 or not a JSON object. Treat those as "no details" so the
  // exception below still reports the status code.
  Map<String, Object?>? json;
  try {
    final decoded = jsonDecode(utf8.decode(bytes));
    if (decoded is Map<String, Object?>) json = decoded;
  } on FormatException {
    // Leave `json` as null.
  }

  if (response.statusCode != 200 || json == null) {
    // We could handle rate limiting or other error codes, but just crashing
    // early here should be fine for this tool.
    throw Exception('Failed to get messages. Code: ${response.statusCode}\nDetails: ${json ?? 'unknown'}');
  }
  return _GetMessagesResult.fromJson(json);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import 'dart:io'; | ||
import 'dart:convert'; | ||
|
||
import 'package:json_annotation/json_annotation.dart'; | ||
|
||
/// One message of a corpus: its ID paired with its rendered HTML.
@JsonSerializable()
final class MessageEntry {
  const MessageEntry({required this.id, required this.html});

  /// Selectively parses from get-message responses.
  ///
  /// Takes the ID from the `id` key and the HTML from the `content` key.
  ///
  /// See also: https://zulip.com/api/get-messages#response
  factory MessageEntry.fromRawMessage(Map<String, Object?> json) {
    final rawId = json['id'] as num;
    return MessageEntry(id: rawId.toInt(), html: json['content'] as String);
  }

  /// Parses an entry in the form produced by [toJson].
  factory MessageEntry.fromJson(Map<String, Object?> json) {
    final rawId = json['id'] as num;
    return MessageEntry(id: rawId.toInt(), html: json['html'] as String);
  }

  /// The message ID, unique within a server.
  final int id;

  /// The rendered HTML of the message.
  final String html;

  /// Encodes this entry as a JSON object with the keys `id` and `html`.
  Map<String, Object> toJson() {
    return <String, Object>{'id': id, 'html': html};
  }
}
|
||
/// Read the [MessageEntry] values stored in the given JSON Lines file.
///
/// The file is decoded as UTF-8 and split into lines, and each line is
/// decoded as one JSON object. Entries are delivered through a stream,
/// rather than collected into a list, to avoid excessive use of memory.
Stream<MessageEntry> readMessagesFromJsonl(File file) {
  final lines = file.openRead()
    .transform(utf8.decoder)
    .transform(const LineSplitter());
  return lines.map((line) {
    final decoded = jsonDecode(line);
    return MessageEntry.fromJson(decoded as Map<String, Object?>);
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
@Timeout(Duration(minutes: 10)) | ||
library; | ||
|
||
import 'dart:io'; | ||
import 'dart:math'; | ||
|
||
import 'package:checks/checks.dart'; | ||
import 'package:html/dom.dart' as dom; | ||
import 'package:flutter/foundation.dart'; | ||
import 'package:flutter_test/flutter_test.dart'; | ||
import 'package:zulip/model/content.dart'; | ||
|
||
import 'model.dart'; | ||
|
||
|
||
/// Survey corpuses of Zulip message HTML for content features that the
/// content parser does not implement yet.
///
/// This test is meant to be manually run.
///
/// To run it, use:
///
///   flutter test tools/content --dart-define=corpusDir=path/to/corpusDir
///
/// where `path/to/corpusDir` should be a directory containing files with
/// outputs generated from tools/content/fetch_messages.dart.
///
/// Optionally, you can enable more details with `--dart-define=verbose=true`.
///
/// One test is registered per corpus file. A failing test reports an overview
/// of the unimplemented features first, followed by the details of each
/// feature. To look for live examples, you can search on the Zulip community
/// by message ID from all public channels.
///
/// For example, a search query like "near: 12345 channels: public" would work.
///
/// See also:
///  * lib/model/content.dart, the implementation of the content parser.
///  * tools/content/fetch_messages.dart, the script that produces the corpuses.
void main() async {
  // Parse every message of `file` and fail if any unimplemented feature
  // was found, with the findings as the failure reason.
  Future<void> checkForUnimplementedFeatureInFile(File file) async {
    final messageIdsByFeature = <String, Set<int>>{};
    final contentsByFeature = <String, List<String>>{};

    await for (final message in readMessagesFromJsonl(file)) {
      final tree = parseContent(message.html).toDiagnosticsNode();
      _walk(message.id, tree,
        messageIdsByFeature: messageIdsByFeature,
        contentsByFeature: contentsByFeature);
    }

    // Collecting the report in a list allows us to avoid using prints
    // directly; it becomes the `because` of the final check.
    final outputLines = <String>[
      if (messageIdsByFeature.isNotEmpty) 'Found unimplemented features:',
      for (final MapEntry(key: featureName, value: messageIds)
          in messageIdsByFeature.entries)
        '- `$featureName`\n Oldest message: ${messageIds.reduce(min)}; newest message: ${messageIds.reduce(max)}\n',
      '',
    ];

    final divider = '\n\n${'=' * 80}\n\n';
    var unsupportedCounter = 0;
    for (final entry in contentsByFeature.entries) {
      unsupportedCounter += 1;
      if (_verbose) {
        final featureName = entry.key;
        outputLines
          ..add('Unsupported feature #$unsupportedCounter: $featureName')
          ..add('message IDs:\n${messageIdsByFeature[featureName]!.join(', ')}')
          ..add('first 10 examples:\n${entry.value.take(10).join(divider)}')
          ..add('\n');
      }
    }
    check(unsupportedCounter, because: outputLines.join('\n')).equals(0);
  }

  final corpusFiles = _getCorpusFiles();
  group('Check for unimplemented features in', () {
    for (final corpusFile in corpusFiles) {
      test(corpusFile.path, () => checkForUnimplementedFeatureInFile(corpusFile));
    }
  }, skip: corpusFiles.isEmpty);
}
|
||
/// Whether details about all messages with unimplemented features
/// should be printed.
///
/// Read from the `verbose` compile-time environment declaration
/// (`--dart-define=verbose=true`); false when it is not set.
const bool _verbose = bool.fromEnvironment('verbose', defaultValue: false);

/// The path of the directory that holds the corpus files.
///
/// Read from the `corpusDir` compile-time environment declaration
/// (`--dart-define=corpusDir=path/to/corpusDir`); empty when it is not set.
const String _corpusDirPath = String.fromEnvironment('corpusDir', defaultValue: '');
|
||
/// List the files directly inside the directory at [_corpusDirPath].
///
/// Entries that are not [File]s are left out. Returns an empty collection
/// when the directory does not exist.
Iterable<File> _getCorpusFiles() {
  final dir = Directory(_corpusDirPath);
  if (!dir.existsSync()) return [];
  return dir.listSync().whereType<File>();
}
|
||
/// Search the diagnostics tree under [node] for unimplemented nodes, and
/// group what is found by the category of the unimplemented feature.
///
/// For every [UnimplementedNode] found, [messageId] is recorded in
/// `messageIdsByFeature` and the node's HTML text in `contentsByFeature`,
/// both keyed by the feature name. The two maps are modified in-place.
/// The children of an unimplemented node are not visited.
void _walk(int messageId, DiagnosticsNode node, {
  required Map<String, Set<int>> messageIdsByFeature,
  required Map<String, List<String>> contentsByFeature,
}) {
  final unimplemented = node.value;
  if (unimplemented is UnimplementedNode) {
    // `featureName` is a prettified identifier used for categorizing
    // unimplemented features that are likely closely related.
    final String featureName;
    final htmlNode = unimplemented.debugHtmlNode;
    if (htmlNode is! dom.Element) {
      featureName = 'DOM node type: ${htmlNode.nodeType}';
    } else if (htmlNode.className.isNotEmpty) {
      featureName = '<${htmlNode.localName!} class="${htmlNode.classes.join(" ")}">';
    } else {
      featureName = '<${htmlNode.localName!}>';
    }
    messageIdsByFeature.putIfAbsent(featureName, () => {}).add(messageId);
    contentsByFeature.putIfAbsent(featureName, () => []).add(unimplemented.debugHtmlText);
    return;
  }

  // Not an unimplemented node itself, so keep looking further down.
  for (final child in node.getChildren()) {
    _walk(messageId, child,
      messageIdsByFeature: messageIdsByFeature,
      contentsByFeature: contentsByFeature);
  }
}