Skip to content

Commit

Permalink
wip; Support systematically surveying unimplemented content features.
Browse files Browse the repository at this point in the history
Signed-off-by: Zixuan James Li <[email protected]>
  • Loading branch information
PIG208 committed Aug 29, 2024
1 parent 7b24f6d commit 5728c5f
Show file tree
Hide file tree
Showing 4 changed files with 382 additions and 0 deletions.
2 changes: 2 additions & 0 deletions message_fixtures/chat.zulip.org.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": 1, "html": "<p>asd</p>"}
{"id": 1, "html": "<p><span class=\"topic-mention\">@topic</span>>!</p>"}
215 changes: 215 additions & 0 deletions tools/content/fetch_messages.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#!/usr/bin/env dart

import 'dart:convert';
import 'dart:io';
import 'dart:math';

// Avoid any Flutter-related dependencies so this can be run in the CLI.
import 'package:args/args.dart';
import 'package:http/http.dart';
import 'package:zulip/api/backoff.dart';

import 'model.dart';

/// Fetch message contents from the specified server in bulk.
///
/// It outputs the message IDs and the rendered HTML contents in
/// .jsonl (https://jsonlines.org) format, which can be used later
/// to perform checks for discovering unimplemented features.
///
/// See tools/content/unimplemented_features_test.dart for more details.
void main(List<String> args) async {
final argParser = ArgParser();
argParser.addOption(
'email',
help: 'The email. See https://zulip.com/api/api-keys for help.',
mandatory: true,
);
argParser.addOption(
'api-key',
help: 'The API key. See https://zulip.com/api/api-keys for help.',
mandatory: true,
);
argParser.addOption(
'site',
help: 'The URL of the Zulip server to fetch messages from.',
valueHelp: 'https://example.zulip.com',
mandatory: true,
);
argParser.addOption(
'file',
help: 'The file to output the messages to. If not given, write output to '
'stdout. Otherwise, if the file exists, its format should match the '
'output of the program. This will first read from the file to avoid '
'duplicates, by fetching messages starting from the newest/oldest '
'known message, then append the output to the end of the file.',
valueHelp: 'path/to/czo.jsonl',
);
argParser.addOption(
'count',
defaultsTo: '100',
help: 'The total number of messages to fetch.',
);
argParser.addFlag(
'fetch-newer',
help: 'Fetch newer messages instead of older ones. '
'Only useful when --file is supplied.',
defaultsTo: false,
);
argParser.addFlag(
'help', abbr: 'h',
negatable: false,
help: 'Show this help message.',
);

void printUsage() {
// ignore: avoid_print
print('Fetch Zulip message contents from a given server.\n'
'Usage: fetch_messages --email <EMAIL> --api-key <API_KEY> --site <SERVER_URL>\n'
'${argParser.usage}');
}

Never throwWithUsage(String error) {
printUsage();
throw Exception('\nError: $error');
}

final parsedArguments = argParser.parse(args);
if (parsedArguments['help'] as bool) {
printUsage();
exit(0);
}

final email = parsedArguments['email'] as String?;
if (email == null) throwWithUsage('Option email is required');

final apiKey = parsedArguments['api-key'] as String?;
if (apiKey == null) throwWithUsage('Option api-key is required');

final realmUrlStr = parsedArguments['site'] as String?;
if (realmUrlStr == null) throwWithUsage('Option site is required');
final realmUrl = Uri.parse(realmUrlStr);

final count = int.parse(parsedArguments['count'] as String);

final outputPath = parsedArguments['file'] as String?;
final fetchNewer = parsedArguments['fetch-newer'] as bool;
int? anchorMessageId;
IOSink output = stdout;
if (outputPath != null) {
final outputFile = File(outputPath);
if (!outputFile.existsSync()) {
outputFile.createSync();
}
await for (final message in readMessagesFromJsonl(outputFile)) {
// Find the newest/oldest message ID as the anchor.
anchorMessageId ??= message.id;
anchorMessageId = (fetchNewer ? max : min)(message.id, anchorMessageId);
}
output = outputFile.openWrite(mode: FileMode.writeOnlyAppend);
}

final client = Client();
final authHeader = 'Basic ${base64Encode(utf8.encode('$email:$apiKey'))}';

// These are working constants chosen abitrarily.
const batchSize = 5000;
const maxRetries = 10;
const fetchInterval = Duration(seconds: 5);

int retries = 0;
int messageToFetch = count;
BackoffMachine? backoff;

while (messageToFetch > 0) {
// Fetch messages in batches, from newer messages to older messages by
// default, until there aren't any more messages to be fetched. Note that
// the message IDs of Zulip messages are higher for newer messages.
final currentBatchSize = (batchSize < messageToFetch) ? batchSize : messageToFetch;
final _GetMessagesResult result;
try {
result = await _getMessages(client, realmUrl: realmUrl,
authHeader: authHeader,
anchorMessageId: anchorMessageId,
numBefore: (!fetchNewer) ? currentBatchSize : 0,
numAfter: (fetchNewer) ? currentBatchSize : 0,
);
} catch (e) {
// We could have more fine-grained error handling and avoid retrying on
// non-network-related failures, but that's skipped for now.
if (retries >= maxRetries) {
rethrow;
}
retries++;
await (backoff ??= BackoffMachine()).wait();
continue;
}

final messageEntries = result.messages.map(MessageEntry.fromRawMessage);
if (messageEntries.isEmpty) {
if (fetchNewer) assert(result.foundNewest);
if (!fetchNewer) assert(result.foundOldest);
break;
}

// Find the newest/oldest message as the next message fetch anchor.
anchorMessageId = messageEntries.map((x) => x.id).reduce(fetchNewer ? max : min);
messageEntries.map(jsonEncode).forEach((json) => output.writeln(json));
messageToFetch -= messageEntries.length;

// This I/O operation could fail, but crashing is fine here.
final flushFuture = output.flush();
// Make sure the delay happens concurrently to the flush.
if (messageToFetch > 0) await Future<void>.delayed(fetchInterval);
await flushFuture;
backoff = null;
}
exit(0);
}

// Ported from [GetMessagesResult] to avoid depending on Flutter libraries.
class _GetMessagesResult {
const _GetMessagesResult(this.foundOldest, this.foundNewest, this.messages);

final bool foundOldest;
final bool foundNewest;
final List<Map<String, Object?>> messages;

factory _GetMessagesResult.fromJson(Map<String, Object?> json) =>
_GetMessagesResult(
json['found_oldest'] as bool,
json['found_newest'] as bool,
(json['messages'] as List<Object?>).map((x) => (x as Map<String, Object?>)).toList());
}

Future<_GetMessagesResult> _getMessages(Client client, {
required Uri realmUrl,
required String authHeader,
required int numBefore,
required int numAfter,
int? anchorMessageId,
}) async {
final url = realmUrl.replace(
path: '/api/v1/messages',
queryParameters: {
'anchor': anchorMessageId != null ? jsonEncode(anchorMessageId) : 'newest',
// A known anchor message already exists in the output,
// so avoid fetching it again.
'include_anchor': jsonEncode(anchorMessageId == null),
'num_before': jsonEncode(numBefore),
'num_after': jsonEncode(numAfter),
'narrow': jsonEncode([{'operator': 'channels', 'operand': 'public'}]),
});
final StreamedResponse response;
response = await client.send(
Request('GET', url)..headers['Authorization'] = authHeader);
final bytes = await response.stream.toBytes();
final json = jsonDecode(utf8.decode(bytes)) as Map<String, dynamic>?;

if (response.statusCode != 200 || json == null) {
// We could handle rate limiting or other error codes, but just crashing
// early here should be fine for this tool.
throw Exception('Failed to get messages. Code: ${response.statusCode}\nDetails: ${json ?? 'unknown'}');
}
return _GetMessagesResult.fromJson(json);
}
38 changes: 38 additions & 0 deletions tools/content/model.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import 'dart:io';
import 'dart:convert';

import 'package:json_annotation/json_annotation.dart';

/// A data structure representing a message.
@JsonSerializable()
final class MessageEntry {
const MessageEntry({
required this.id,
required this.html,
});

/// Selectively parses from get-message responses.
///
/// See also: https://zulip.com/api/get-messages#response
factory MessageEntry.fromRawMessage(Map<String, Object?> json) =>
MessageEntry(id: (json['id'] as num).toInt(), html: json['content'] as String);

factory MessageEntry.fromJson(Map<String, Object?> json) =>
MessageEntry(id: (json['id'] as num).toInt(), html: json['html'] as String);

Map<String, Object> toJson() => {'id': id, 'html': html};

/// The message ID, unique within a server.
final int id;

/// The rendered HTML of the message.
final String html;
}

/// Open the given JSON Lines file and read [MessageEntry] from it.
///
/// We store the entries in JSON Lines format and return them from a stream to
/// avoid excessive use of memory.
Stream<MessageEntry> readMessagesFromJsonl(File file) => file.openRead()
.transform(utf8.decoder).transform(const LineSplitter())
.map(jsonDecode).map((x) => MessageEntry.fromJson(x as Map<String, Object?>));
127 changes: 127 additions & 0 deletions tools/content/unimplemented_features_test.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
@Timeout(Duration(minutes: 10))
library;

import 'dart:io';
import 'dart:math';

import 'package:checks/checks.dart';
import 'package:html/dom.dart' as dom;
import 'package:flutter/foundation.dart';
import 'package:flutter_test/flutter_test.dart';
import 'package:zulip/model/content.dart';

import 'model.dart';


/// Check if there are unimplemented features from the given corpuses of HTML
/// contents from Zulip messages.
///
/// This test is meant to be run manually
///
/// To run it, use:
///
/// flutter test tools/content --dart-define=corpusDir=path/to/corpusDir
///
/// where `path/to/corpusDir` should be a directory containing files with
/// outputs generated from tools/content/fetch_messages.dart.
///
/// Optionally, you can enable more details with `--dart-define=verbose=true`.
///
/// The test writes an overview of unimplemented features at the beginning to
/// standard output, followed by the details of each feature. To look for live
/// examples, you can search on the Zulip community by message ID from all
/// public channels.
///
/// For example, a search query like "near: 12345 channels: public" would work.
///
/// See also:
/// * lib/model/content.dart, the implementation of the content parser.
/// * tools/content/fetch_messages.dart, the script that produces the corpuses.
void main() async {
Future<void> checkForUnimplementedFeatureInFile(File file) async {
final messageIdsByFeature = <String, Set<int>>{};
final contentsByFeature = <String, List<String>>{};

await for (final message in readMessagesFromJsonl(file)) {
_walk(message.id, parseContent(message.html).toDiagnosticsNode(),
messageIdsByFeature: messageIdsByFeature,
contentsByFeature: contentsByFeature);
}

// This buffer allows us to avoid using prints directly.
final outputLines = <String>[];
if (messageIdsByFeature.isNotEmpty) outputLines.add('Found unimplemented features:');
for (final featureName in messageIdsByFeature.keys) {
Set<int> messageIds = messageIdsByFeature[featureName]!;
int oldestId = messageIds.reduce(min);
int newestId = messageIds.reduce(max);
outputLines.add('- `$featureName`\n Oldest message: $oldestId; newest message: $newestId\n');
}
outputLines.add('');

final divider = '\n\n${'=' * 80}\n\n';
int unsupportedCounter = 0;
for (final MapEntry(key: featureName, value: messageContents) in contentsByFeature.entries) {
unsupportedCounter++;
if (!_verbose) continue;
outputLines.addAll([
'Unsupported feature #$unsupportedCounter: $featureName',
'message IDs:\n${messageIdsByFeature[featureName]!.join(', ')}',
'first 10 examples:\n${messageContents.take(10).join(divider)}',
'\n',
]);
}
check(unsupportedCounter, because: outputLines.join('\n')).equals(0);
}

final corpusFiles = _getCorpusFiles();
group('Check for unimplemented features in', () {
for (final file in corpusFiles) {
test(file.path, () => checkForUnimplementedFeatureInFile(file));
}
}, skip: corpusFiles.isEmpty);
}

// Determine whether details about all messages with unimplemented features
// should be printed.
const bool _verbose = bool.fromEnvironment('verbose');

const String _corpusDirPath = String.fromEnvironment('corpusDir');

Iterable<File> _getCorpusFiles() {
final corpusDir = Directory(_corpusDirPath);
return corpusDir.existsSync() ? corpusDir.listSync().whereType<File>() : [];
}

/// Walk the tree looking for unimplemented nodes, and aggregate them by the
/// category of the unimplemented feature.
///
/// This modifies `messageIdsByFeature` and `contentsByFeature` in-place.
void _walk(int messageId, DiagnosticsNode node, {
required Map<String, Set<int>> messageIdsByFeature,
required Map<String, List<String>> contentsByFeature,
}) {
final value = node.value;
if (value is! UnimplementedNode) {
for (final child in node.getChildren()) {
_walk(messageId, child,
messageIdsByFeature: messageIdsByFeature,
contentsByFeature: contentsByFeature);
}
return;
}

final htmlNode = value.debugHtmlNode;
final String featureName;
if (htmlNode is dom.Element) {
if (htmlNode.className.isEmpty) {
featureName = '<${htmlNode.localName!}>';
} else {
featureName = '<${htmlNode.localName!} class="${htmlNode.classes.join(" ")}">';
}
} else {
featureName = 'DOM node type: ${htmlNode.nodeType}';
}
(messageIdsByFeature[featureName] ??= {}).add(messageId);
(contentsByFeature[featureName] ??= []).add(value.debugHtmlText);
}

0 comments on commit 5728c5f

Please sign in to comment.