-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
wip; Support systematically surveying unimplemented content features.
Signed-off-by: Zixuan James Li <[email protected]>
- Loading branch information
Showing
4 changed files
with
382 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"id": 1, "html": "<p>asd</p>"} | ||
{"id": 1, "html": "<p><span class=\"topic-mention\">@topic</span>>!</p>"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
#!/usr/bin/env dart | ||
|
||
import 'dart:convert'; | ||
import 'dart:io'; | ||
import 'dart:math'; | ||
|
||
// Avoid any Flutter-related dependencies so this can be run in the CLI. | ||
import 'package:args/args.dart'; | ||
import 'package:http/http.dart'; | ||
import 'package:zulip/api/backoff.dart'; | ||
|
||
import 'model.dart'; | ||
|
||
/// Fetch message contents from the specified server in bulk. | ||
/// | ||
/// It outputs the message IDs and the rendered HTML contents in | ||
/// .jsonl (https://jsonlines.org) format, which can be used later | ||
/// to perform checks for discovering unimplemented features. | ||
/// | ||
/// See tools/content/unimplemented_features_test.dart for more details. | ||
void main(List<String> args) async { | ||
final argParser = ArgParser(); | ||
argParser.addOption( | ||
'email', | ||
help: 'The email. See https://zulip.com/api/api-keys for help.', | ||
mandatory: true, | ||
); | ||
argParser.addOption( | ||
'api-key', | ||
help: 'The API key. See https://zulip.com/api/api-keys for help.', | ||
mandatory: true, | ||
); | ||
argParser.addOption( | ||
'site', | ||
help: 'The URL of the Zulip server to fetch messages from.', | ||
valueHelp: 'https://example.zulip.com', | ||
mandatory: true, | ||
); | ||
argParser.addOption( | ||
'file', | ||
help: 'The file to output the messages to. If not given, write output to ' | ||
'stdout. Otherwise, if the file exists, its format should match the ' | ||
'output of the program. This will first read from the file to avoid ' | ||
'duplicates, by fetching messages starting from the newest/oldest ' | ||
'known message, then append the output to the end of the file.', | ||
valueHelp: 'path/to/czo.jsonl', | ||
); | ||
argParser.addOption( | ||
'count', | ||
defaultsTo: '100', | ||
help: 'The total number of messages to fetch.', | ||
); | ||
argParser.addFlag( | ||
'fetch-newer', | ||
help: 'Fetch newer messages instead of older ones. ' | ||
'Only useful when --file is supplied.', | ||
defaultsTo: false, | ||
); | ||
argParser.addFlag( | ||
'help', abbr: 'h', | ||
negatable: false, | ||
help: 'Show this help message.', | ||
); | ||
|
||
void printUsage() { | ||
// ignore: avoid_print | ||
print('Fetch Zulip message contents from a given server.\n' | ||
'Usage: fetch_messages --email <EMAIL> --api-key <API_KEY> --site <SERVER_URL>\n' | ||
'${argParser.usage}'); | ||
} | ||
|
||
Never throwWithUsage(String error) { | ||
printUsage(); | ||
throw Exception('\nError: $error'); | ||
} | ||
|
||
final parsedArguments = argParser.parse(args); | ||
if (parsedArguments['help'] as bool) { | ||
printUsage(); | ||
exit(0); | ||
} | ||
|
||
final email = parsedArguments['email'] as String?; | ||
if (email == null) throwWithUsage('Option email is required'); | ||
|
||
final apiKey = parsedArguments['api-key'] as String?; | ||
if (apiKey == null) throwWithUsage('Option api-key is required'); | ||
|
||
final realmUrlStr = parsedArguments['site'] as String?; | ||
if (realmUrlStr == null) throwWithUsage('Option site is required'); | ||
final realmUrl = Uri.parse(realmUrlStr); | ||
|
||
final count = int.parse(parsedArguments['count'] as String); | ||
|
||
final outputPath = parsedArguments['file'] as String?; | ||
final fetchNewer = parsedArguments['fetch-newer'] as bool; | ||
int? anchorMessageId; | ||
IOSink output = stdout; | ||
if (outputPath != null) { | ||
final outputFile = File(outputPath); | ||
if (!outputFile.existsSync()) { | ||
outputFile.createSync(); | ||
} | ||
await for (final message in readMessagesFromJsonl(outputFile)) { | ||
// Find the newest/oldest message ID as the anchor. | ||
anchorMessageId ??= message.id; | ||
anchorMessageId = (fetchNewer ? max : min)(message.id, anchorMessageId); | ||
} | ||
output = outputFile.openWrite(mode: FileMode.writeOnlyAppend); | ||
} | ||
|
||
final client = Client(); | ||
final authHeader = 'Basic ${base64Encode(utf8.encode('$email:$apiKey'))}'; | ||
|
||
// These are working constants chosen abitrarily. | ||
const batchSize = 5000; | ||
const maxRetries = 10; | ||
const fetchInterval = Duration(seconds: 5); | ||
|
||
int retries = 0; | ||
int messageToFetch = count; | ||
BackoffMachine? backoff; | ||
|
||
while (messageToFetch > 0) { | ||
// Fetch messages in batches, from newer messages to older messages by | ||
// default, until there aren't any more messages to be fetched. Note that | ||
// the message IDs of Zulip messages are higher for newer messages. | ||
final currentBatchSize = (batchSize < messageToFetch) ? batchSize : messageToFetch; | ||
final _GetMessagesResult result; | ||
try { | ||
result = await _getMessages(client, realmUrl: realmUrl, | ||
authHeader: authHeader, | ||
anchorMessageId: anchorMessageId, | ||
numBefore: (!fetchNewer) ? currentBatchSize : 0, | ||
numAfter: (fetchNewer) ? currentBatchSize : 0, | ||
); | ||
} catch (e) { | ||
// We could have more fine-grained error handling and avoid retrying on | ||
// non-network-related failures, but that's skipped for now. | ||
if (retries >= maxRetries) { | ||
rethrow; | ||
} | ||
retries++; | ||
await (backoff ??= BackoffMachine()).wait(); | ||
continue; | ||
} | ||
|
||
final messageEntries = result.messages.map(MessageEntry.fromRawMessage); | ||
if (messageEntries.isEmpty) { | ||
if (fetchNewer) assert(result.foundNewest); | ||
if (!fetchNewer) assert(result.foundOldest); | ||
break; | ||
} | ||
|
||
// Find the newest/oldest message as the next message fetch anchor. | ||
anchorMessageId = messageEntries.map((x) => x.id).reduce(fetchNewer ? max : min); | ||
messageEntries.map(jsonEncode).forEach((json) => output.writeln(json)); | ||
messageToFetch -= messageEntries.length; | ||
|
||
// This I/O operation could fail, but crashing is fine here. | ||
final flushFuture = output.flush(); | ||
// Make sure the delay happens concurrently to the flush. | ||
if (messageToFetch > 0) await Future<void>.delayed(fetchInterval); | ||
await flushFuture; | ||
backoff = null; | ||
} | ||
exit(0); | ||
} | ||
|
||
// Ported from [GetMessagesResult] to avoid depending on Flutter libraries. | ||
class _GetMessagesResult { | ||
const _GetMessagesResult(this.foundOldest, this.foundNewest, this.messages); | ||
|
||
final bool foundOldest; | ||
final bool foundNewest; | ||
final List<Map<String, Object?>> messages; | ||
|
||
factory _GetMessagesResult.fromJson(Map<String, Object?> json) => | ||
_GetMessagesResult( | ||
json['found_oldest'] as bool, | ||
json['found_newest'] as bool, | ||
(json['messages'] as List<Object?>).map((x) => (x as Map<String, Object?>)).toList()); | ||
} | ||
|
||
Future<_GetMessagesResult> _getMessages(Client client, { | ||
required Uri realmUrl, | ||
required String authHeader, | ||
required int numBefore, | ||
required int numAfter, | ||
int? anchorMessageId, | ||
}) async { | ||
final url = realmUrl.replace( | ||
path: '/api/v1/messages', | ||
queryParameters: { | ||
'anchor': anchorMessageId != null ? jsonEncode(anchorMessageId) : 'newest', | ||
// A known anchor message already exists in the output, | ||
// so avoid fetching it again. | ||
'include_anchor': jsonEncode(anchorMessageId == null), | ||
'num_before': jsonEncode(numBefore), | ||
'num_after': jsonEncode(numAfter), | ||
'narrow': jsonEncode([{'operator': 'channels', 'operand': 'public'}]), | ||
}); | ||
final StreamedResponse response; | ||
response = await client.send( | ||
Request('GET', url)..headers['Authorization'] = authHeader); | ||
final bytes = await response.stream.toBytes(); | ||
final json = jsonDecode(utf8.decode(bytes)) as Map<String, dynamic>?; | ||
|
||
if (response.statusCode != 200 || json == null) { | ||
// We could handle rate limiting or other error codes, but just crashing | ||
// early here should be fine for this tool. | ||
throw Exception('Failed to get messages. Code: ${response.statusCode}\nDetails: ${json ?? 'unknown'}'); | ||
} | ||
return _GetMessagesResult.fromJson(json); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import 'dart:io'; | ||
import 'dart:convert'; | ||
|
||
import 'package:json_annotation/json_annotation.dart'; | ||
|
||
/// A data structure representing a message. | ||
@JsonSerializable() | ||
final class MessageEntry { | ||
const MessageEntry({ | ||
required this.id, | ||
required this.html, | ||
}); | ||
|
||
/// Selectively parses from get-message responses. | ||
/// | ||
/// See also: https://zulip.com/api/get-messages#response | ||
factory MessageEntry.fromRawMessage(Map<String, Object?> json) => | ||
MessageEntry(id: (json['id'] as num).toInt(), html: json['content'] as String); | ||
|
||
factory MessageEntry.fromJson(Map<String, Object?> json) => | ||
MessageEntry(id: (json['id'] as num).toInt(), html: json['html'] as String); | ||
|
||
Map<String, Object> toJson() => {'id': id, 'html': html}; | ||
|
||
/// The message ID, unique within a server. | ||
final int id; | ||
|
||
/// The rendered HTML of the message. | ||
final String html; | ||
} | ||
|
||
/// Open the given JSON Lines file and read [MessageEntry] from it. | ||
/// | ||
/// We store the entries in JSON Lines format and return them from a stream to | ||
/// avoid excessive use of memory. | ||
Stream<MessageEntry> readMessagesFromJsonl(File file) => file.openRead() | ||
.transform(utf8.decoder).transform(const LineSplitter()) | ||
.map(jsonDecode).map((x) => MessageEntry.fromJson(x as Map<String, Object?>)); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
@Timeout(Duration(minutes: 10)) | ||
library; | ||
|
||
import 'dart:io'; | ||
import 'dart:math'; | ||
|
||
import 'package:checks/checks.dart'; | ||
import 'package:html/dom.dart' as dom; | ||
import 'package:flutter/foundation.dart'; | ||
import 'package:flutter_test/flutter_test.dart'; | ||
import 'package:zulip/model/content.dart'; | ||
|
||
import 'model.dart'; | ||
|
||
|
||
/// Check if there are unimplemented features from the given corpuses of HTML | ||
/// contents from Zulip messages. | ||
/// | ||
/// This test is meant to be run manually | ||
/// | ||
/// To run it, use: | ||
/// | ||
/// flutter test tools/content --dart-define=corpusDir=path/to/corpusDir | ||
/// | ||
/// where `path/to/corpusDir` should be a directory containing files with | ||
/// outputs generated from tools/content/fetch_messages.dart. | ||
/// | ||
/// Optionally, you can enable more details with `--dart-define=verbose=true`. | ||
/// | ||
/// The test writes an overview of unimplemented features at the beginning to | ||
/// standard output, followed by the details of each feature. To look for live | ||
/// examples, you can search on the Zulip community by message ID from all | ||
/// public channels. | ||
/// | ||
/// For example, a search query like "near: 12345 channels: public" would work. | ||
/// | ||
/// See also: | ||
/// * lib/model/content.dart, the implementation of the content parser. | ||
/// * tools/content/fetch_messages.dart, the script that produces the corpuses. | ||
void main() async { | ||
Future<void> checkForUnimplementedFeatureInFile(File file) async { | ||
final messageIdsByFeature = <String, Set<int>>{}; | ||
final contentsByFeature = <String, List<String>>{}; | ||
|
||
await for (final message in readMessagesFromJsonl(file)) { | ||
_walk(message.id, parseContent(message.html).toDiagnosticsNode(), | ||
messageIdsByFeature: messageIdsByFeature, | ||
contentsByFeature: contentsByFeature); | ||
} | ||
|
||
// This buffer allows us to avoid using prints directly. | ||
final outputLines = <String>[]; | ||
if (messageIdsByFeature.isNotEmpty) outputLines.add('Found unimplemented features:'); | ||
for (final featureName in messageIdsByFeature.keys) { | ||
Set<int> messageIds = messageIdsByFeature[featureName]!; | ||
int oldestId = messageIds.reduce(min); | ||
int newestId = messageIds.reduce(max); | ||
outputLines.add('- `$featureName`\n Oldest message: $oldestId; newest message: $newestId\n'); | ||
} | ||
outputLines.add(''); | ||
|
||
final divider = '\n\n${'=' * 80}\n\n'; | ||
int unsupportedCounter = 0; | ||
for (final MapEntry(key: featureName, value: messageContents) in contentsByFeature.entries) { | ||
unsupportedCounter++; | ||
if (!_verbose) continue; | ||
outputLines.addAll([ | ||
'Unsupported feature #$unsupportedCounter: $featureName', | ||
'message IDs:\n${messageIdsByFeature[featureName]!.join(', ')}', | ||
'first 10 examples:\n${messageContents.take(10).join(divider)}', | ||
'\n', | ||
]); | ||
} | ||
check(unsupportedCounter, because: outputLines.join('\n')).equals(0); | ||
} | ||
|
||
final corpusFiles = _getCorpusFiles(); | ||
group('Check for unimplemented features in', () { | ||
for (final file in corpusFiles) { | ||
test(file.path, () => checkForUnimplementedFeatureInFile(file)); | ||
} | ||
}, skip: corpusFiles.isEmpty); | ||
} | ||
|
||
// Determine whether details about all messages with unimplemented features | ||
// should be printed. | ||
const bool _verbose = bool.fromEnvironment('verbose'); | ||
|
||
const String _corpusDirPath = String.fromEnvironment('corpusDir'); | ||
|
||
Iterable<File> _getCorpusFiles() { | ||
final corpusDir = Directory(_corpusDirPath); | ||
return corpusDir.existsSync() ? corpusDir.listSync().whereType<File>() : []; | ||
} | ||
|
||
/// Walk the tree looking for unimplemented nodes, and aggregate them by the | ||
/// category of the unimplemented feature. | ||
/// | ||
/// This modifies `messageIdsByFeature` and `contentsByFeature` in-place. | ||
void _walk(int messageId, DiagnosticsNode node, { | ||
required Map<String, Set<int>> messageIdsByFeature, | ||
required Map<String, List<String>> contentsByFeature, | ||
}) { | ||
final value = node.value; | ||
if (value is! UnimplementedNode) { | ||
for (final child in node.getChildren()) { | ||
_walk(messageId, child, | ||
messageIdsByFeature: messageIdsByFeature, | ||
contentsByFeature: contentsByFeature); | ||
} | ||
return; | ||
} | ||
|
||
final htmlNode = value.debugHtmlNode; | ||
final String featureName; | ||
if (htmlNode is dom.Element) { | ||
if (htmlNode.className.isEmpty) { | ||
featureName = '<${htmlNode.localName!}>'; | ||
} else { | ||
featureName = '<${htmlNode.localName!} class="${htmlNode.classes.join(" ")}">'; | ||
} | ||
} else { | ||
featureName = 'DOM node type: ${htmlNode.nodeType}'; | ||
} | ||
(messageIdsByFeature[featureName] ??= {}).add(messageId); | ||
(contentsByFeature[featureName] ??= []).add(value.debugHtmlText); | ||
} |