Skip to content

Commit

Permalink
File deduplication (#6332)
Browse files Browse the repository at this point in the history
When receiving messages, blobs will be deduplicated with the new
function `create_and_deduplicate_from_bytes()`. For sending files, this
adds a new function `set_file_and_deduplicate()` instead of
deduplicating by default.

This is for
#6265; read the
issue description there for more details.

TODO:
- [x] Set files as read-only
- [x] Don't do a write when the file is already identical
- [x] The first 32 chars or so of the 64-character hash are enough. I
calculated that if 10b people (i.e. all of humanity) use DC, and each of
them has 200k distinct blob files (I have 4k in my day-to-day account),
and we used 20 chars, then the expected value for the number of name
collisions would be ~0.0002 (and the probability that there is a least
one name collision is lower than that) [^1]. I added 12 more characters
to be on the super safe side, but this wouldn't be necessary and I could
also make it 20 instead of 32.
- Not 100% sure whether that's necessary at all - it would mainly be
necessary if we might hit a length limit on some file systems (the
blobdir is usually sth like
`accounts/2ff9fc096d2f46b6832b24a1ed99c0d6/dc.db-blobs` (53 chars), plus
64 chars for the filename would be 117).
- [x] "touch" the files to prevent them from being deleted
- [x] TODOs in the code

For later PRs:
- Replace `BlobObject::create(…)` with
`BlobObject::create_and_deduplicate(…)` in order to deduplicate
everytime core creates a file
- Modify JsonRPC to deduplicate blob files
- Possibly rename BlobObject.name to BlobObject.file in order to prevent
confusion (because `name` usually means "user-visible-name", not "name
of the file on disk").

[^1]: Calculated with both https://printfn.github.io/fend/ and
https://www.geogebra.org/calculator, both of which came to the same
result
([1](https://github.com/user-attachments/assets/bbb62550-3781-48b5-88b1-ba0e29c28c0d),

[2](https://github.com/user-attachments/assets/82171212-b797-4117-a39f-0e132eac7252))

---------

Co-authored-by: l <[email protected]>
  • Loading branch information
Hocuri and link2xt authored Jan 21, 2025
1 parent 22a7cfe commit 65a9c4b
Show file tree
Hide file tree
Showing 23 changed files with 582 additions and 239 deletions.
9 changes: 5 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ toml = "0.8"
url = "2"
uuid = { version = "1", features = ["serde", "v4"] }
webpki-roots = "0.26.7"
blake3 = "1.5.5"

[dev-dependencies]
anyhow = { workspace = true, features = ["backtrace"] } # Enable `backtrace` feature in tests.
Expand Down
25 changes: 25 additions & 0 deletions deltachat-ffi/deltachat.h
Original file line number Diff line number Diff line change
Expand Up @@ -4756,6 +4756,31 @@ void dc_msg_set_override_sender_name(dc_msg_t* msg, const char* name)
void dc_msg_set_file (dc_msg_t* msg, const char* file, const char* filemime);


/**
* Sets the file associated with a message.
*
* If `name` is non-null, it is used as the file name
* and the actual current name of the file is ignored.
*
* If the source file is already in the blobdir, it will be renamed,
* otherwise it will be copied to the blobdir first.
*
* In order to deduplicate files that contain the same data,
* the file will be named as a hash of the file data.
*
* NOTE:
* - This function will rename the file. To get the new file path, call `get_file()`.
* - The file must not be modified after this function was called.
*
* @memberof dc_msg_t
* @param msg The message object. Must not be NULL.
* @param file The path of the file to attach. Must not be NULL.
* @param name The original filename of the attachment. If NULL, the current name of `file` will be used instead.
* @param filemime The MIME type of the file. NULL if you don't know or don't care.
*/
void dc_msg_set_file_and_deduplicate(dc_msg_t* msg, const char* file, const char* name, const char* filemime);


/**
* Set the dimensions associated with message object.
* Typically this is the width and the height of an image or video associated using dc_msg_set_file().
Expand Down
27 changes: 27 additions & 0 deletions deltachat-ffi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3835,6 +3835,33 @@ pub unsafe extern "C" fn dc_msg_set_file(
)
}

#[no_mangle]
pub unsafe extern "C" fn dc_msg_set_file_and_deduplicate(
msg: *mut dc_msg_t,
file: *const libc::c_char,
name: *const libc::c_char,
filemime: *const libc::c_char,
) {
if msg.is_null() || file.is_null() {
eprintln!("ignoring careless call to dc_msg_set_file_and_deduplicate()");
return;
}
let ffi_msg = &mut *msg;
let ctx = &*ffi_msg.context;

ffi_msg
.message
.set_file_and_deduplicate(
ctx,
as_path(file),
to_opt_string_lossy(name).as_deref(),
to_opt_string_lossy(filemime).as_deref(),
)
.context("Failed to set file")
.log_err(&*ffi_msg.context)
.ok();
}

#[no_mangle]
pub unsafe extern "C" fn dc_msg_set_dimension(
msg: *mut dc_msg_t,
Expand Down
7 changes: 5 additions & 2 deletions python/src/deltachat/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@ def set_html(self, html_text):

@props.with_doc
def filename(self):
"""filename if there was an attachment, otherwise empty string."""
"""file path if there was an attachment, otherwise empty string.
If you want to get the file extension or a user-visible string,
use `basename` instead."""
return from_dc_charpointer(lib.dc_msg_get_file(self._dc_msg))

def set_file(self, path, mime_type=None):
Expand All @@ -120,7 +122,8 @@ def set_file(self, path, mime_type=None):

@props.with_doc
def basename(self) -> str:
"""basename of the attachment if it exists, otherwise empty string."""
"""The user-visible name of the attachment (incl. extension)
if it exists, otherwise empty string."""
# FIXME, it does not return basename
return from_dc_charpointer(lib.dc_msg_get_filename(self._dc_msg))

Expand Down
15 changes: 8 additions & 7 deletions python/tests/test_1_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,16 @@ def send_and_receive_message():
msg = send_and_receive_message()
assert msg.text == "withfile"
assert open(msg.filename).read() == "some data"
msg.filename.index(basename)
assert msg.filename.endswith(ext)
msg.basename.index(basename)
assert msg.basename.endswith(ext)

msg2 = send_and_receive_message()
assert msg2.text == "withfile"
assert open(msg2.filename).read() == "some data"
msg2.filename.index(basename)
assert msg2.filename.endswith(ext)
assert msg.filename != msg2.filename
msg2.basename.index(basename)
assert msg2.basename.endswith(ext)
assert msg.filename == msg2.filename # The file is deduplicated
assert msg.basename == msg2.basename


def test_send_file_html_attachment(tmp_path, acfactory, lp):
Expand All @@ -214,8 +215,8 @@ def test_send_file_html_attachment(tmp_path, acfactory, lp):
msg = ac2.get_message_by_id(ev.data2)

assert open(msg.filename).read() == content
msg.filename.index(basename)
assert msg.filename.endswith(ext)
msg.basename.index(basename)
assert msg.basename.endswith(ext)


def test_html_message(acfactory, lp):
Expand Down
Loading

0 comments on commit 65a9c4b

Please sign in to comment.