1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
use std::{
    collections::HashMap,
    io::{self, Cursor},
    pin::pin,
    sync::Arc,
    task::Poll,
};

use data_encoding::HEXLOWER;
use fastcdc::v2020::AsyncStreamCDC;
use futures::Future;
use object_store::{path::Path, ObjectStore};
use pin_project_lite::pin_project;
use prost::Message;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_stream::StreamExt;
use tonic::async_trait;
use tracing::{debug, instrument, trace, Level};
use url::Url;

use crate::{
    composition::{CompositionContext, ServiceBuilder},
    proto::{stat_blob_response::ChunkMeta, StatBlobResponse},
    B3Digest, B3HashingReader, Error,
};

use super::{BlobReader, BlobService, BlobWriter, ChunkedReader};

/// Uses any object storage supported by the [object_store] crate to provide a
/// tvix-castore [BlobService].
///
/// # Data format
/// Data is organized in "blobs" and "chunks".
/// Blobs don't hold the actual data, but instead contain a list of more
/// granular chunks that assemble to the contents requested.
/// This allows clients to seek, and to avoid downloading chunks they already
/// have locally because they are referred to from other files.
/// Check `rpc_blobstore` and more general BlobStore docs on that.
///
/// ## Blobs
/// Stored at `${base_path}/blobs/b3/$digest_key`. They contain the serialized
/// StatBlobResponse for the blob with the digest.
///
/// ## Chunks
/// Chunks are stored at `${base_path}/chunks/b3/$digest_key`. They contain
/// the literal contents of the chunk, but are zstd-compressed.
///
/// ## Digest key sharding
/// The blake3 digest is encoded in lower hex, and sharded by its first two
/// bytes (i.e. the first four hex characters).
/// The blob for "Hello World" is stored at
/// `${base_path}/blobs/b3/41f8/41f8394111eb713a22165c46c90ab8f0fd9399c92028fd6d288944b23ff5bf76`.
///
/// This reduces the number of files in the same directory, which would be a
/// problem at least when using [object_store::local::LocalFileSystem].
///
/// # Future changes
/// There are no guarantees about this being a final format yet.
/// Once object_store gets support for additional metadata / content-types,
/// we can eliminate some requests (small blobs only consisting of a single
/// chunk can be stored as-is, without the blob index file).
/// It also allows signalling any compression of chunks in the content-type.
/// Migration *should* be possible by simply adding the right content-types to
/// all keys stored so far, but no promises ;-)
#[derive(Clone)]
pub struct ObjectStoreBlobService {
    /// Name of this service instance; recorded as a field on tracing spans.
    instance_name: String,
    /// The object store backend all blobs and chunks are read from and
    /// written to.
    object_store: Arc<dyn ObjectStore>,
    /// Path prefix inside the object store, below which the `blobs` and
    /// `chunks` hierarchies are placed.
    base_path: Path,

    /// Average chunk size for FastCDC, in bytes.
    /// min value is half, max value double of that number.
    avg_chunk_size: u32,
}

/// Computes the object store path of the blob index entry for `digest`:
/// `${base_path}/blobs/b3/${shard}/${digest}`, where both trailing components
/// are lower-hex encoded and `${shard}` covers the first two digest bytes.
#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,blob.digest=%digest),ret(Display))]
fn derive_blob_path(base_path: &Path, digest: &B3Digest) -> Path {
    let digest_bytes = digest.as_slice();
    let shard = HEXLOWER.encode(&digest_bytes[..2]);
    let key = HEXLOWER.encode(digest_bytes);

    base_path.child("blobs").child("b3").child(shard).child(key)
}

/// Computes the object store path of the chunk with the given `digest`:
/// `${base_path}/chunks/b3/${shard}/${digest}`, where both trailing components
/// are lower-hex encoded and `${shard}` covers the first two digest bytes.
#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,chunk.digest=%digest),ret(Display))]
fn derive_chunk_path(base_path: &Path, digest: &B3Digest) -> Path {
    let digest_bytes = digest.as_slice();
    let shard = HEXLOWER.encode(&digest_bytes[..2]);
    let key = HEXLOWER.encode(digest_bytes);

    base_path.child("chunks").child("b3").child(shard).child(key)
}

#[async_trait]
impl BlobService for ObjectStoreBlobService {
    #[instrument(skip_all, ret(level = Level::TRACE), err, fields(blob.digest=%digest, instance_name=%self.instance_name))]
    async fn has(&self, digest: &B3Digest) -> io::Result<bool> {
        // TODO: clarify if this should work for chunks or not, and explicitly
        // document in the proto docs.
        let p = derive_blob_path(&self.base_path, digest);

        match self.object_store.head(&p).await {
            Ok(_) => Ok(true),
            Err(object_store::Error::NotFound { .. }) => {
                let p = derive_chunk_path(&self.base_path, digest);
                match self.object_store.head(&p).await {
                    Ok(_) => Ok(true),
                    Err(object_store::Error::NotFound { .. }) => Ok(false),
                    Err(e) => Err(e)?,
                }
            }
            Err(e) => Err(e)?,
        }
    }

    /// Opens a reader over the contents identified by `digest`.
    ///
    /// - The empty blob is served from memory without touching the backend.
    /// - If a chunk exists at the chunk path for `digest`, it is fetched
    ///   entirely, decompressed, verified against `digest` and served from
    ///   memory.
    /// - Otherwise the chunk list of the blob is looked up and a
    ///   [ChunkedReader] assembling these chunks is returned.
    ///
    /// Returns `Ok(None)` if neither a chunk nor a blob is stored for `digest`.
    ///
    /// # Errors
    /// Returns an error if the backend fails, if decompression fails, if the
    /// chunk contents do not hash to `digest`, or if the stored chunk list
    /// contains an entry that is not a valid blake3 digest.
    #[instrument(skip_all, err, fields(blob.digest=%digest, instance_name=%self.instance_name))]
    async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> {
        // handle reading the empty blob.
        if digest.as_slice() == blake3::hash(b"").as_bytes() {
            return Ok(Some(Box::new(Cursor::new(b"")) as Box<dyn BlobReader>));
        }
        match self
            .object_store
            .get(&derive_chunk_path(&self.base_path, digest))
            .await
        {
            Ok(res) => {
                // handle reading blobs that are small enough to fit inside a single chunk:
                // fetch the entire chunk into memory, decompress, ensure the b3 digest matches,
                // and return a io::Cursor over that data.
                // FUTUREWORK: use zstd::bulk to prevent decompression bombs

                let chunk_raw_bytes = res.bytes().await?;
                let chunk_contents = zstd::stream::decode_all(Cursor::new(chunk_raw_bytes))?;

                if *digest != blake3::hash(&chunk_contents).as_bytes().into() {
                    return Err(io::Error::other("chunk contents invalid"));
                }

                Ok(Some(Box::new(Cursor::new(chunk_contents))))
            }
            Err(object_store::Error::NotFound { .. }) => {
                // NOTE: For public-facing things, we would want to stop here.
                // Clients should fetch granularly, so they can make use of
                // chunks they have locally.
                // However, if this is used directly, without any caches, do the
                // assembly here.
                // This is subject to change, once we have store composition.
                // TODO: make this configurable, and/or clarify behaviour for
                // the gRPC server surface (explicitly document behaviour in the
                // proto docs)
                let Some(chunks) = self.chunks(digest).await? else {
                    // This is neither a chunk nor a blob, return None.
                    return Ok(None);
                };

                // The chunk list was read back from storage, so a malformed
                // digest is reported as invalid data rather than a panic.
                let chunks = chunks
                    .into_iter()
                    .map(|chunk| -> io::Result<(B3Digest, u64)> {
                        let chunk_digest: B3Digest = chunk.digest.try_into().map_err(|_| {
                            io::Error::new(io::ErrorKind::InvalidData, "invalid b3 digest")
                        })?;
                        Ok((chunk_digest, chunk.size))
                    })
                    .collect::<io::Result<Vec<(B3Digest, u64)>>>()?;

                let chunked_reader = ChunkedReader::from_chunks(
                    chunks.into_iter(),
                    Arc::new(self.clone()) as Arc<dyn BlobService>,
                );

                Ok(Some(Box::new(chunked_reader)))
            }
            Err(e) => Err(e.into()),
        }
    }

    /// Returns a [BlobWriter] that chunks and uploads everything written to it.
    ///
    /// The writer is backed by an in-memory pipe sized at ten times the
    /// average chunk size. Chunk size limits handed to the chunker are half
    /// (minimum) and double (maximum) of the configured average.
    #[instrument(skip_all, fields(instance_name=%self.instance_name))]
    async fn open_write(&self) -> Box<dyn BlobWriter> {
        let avg = self.avg_chunk_size;

        // ObjectStoreBlobWriter implements AsyncWrite, whereas the chunking
        // consumes an AsyncRead, so both are connected through a pipe.
        // Writes are delegated to the write half, and the writer periodically
        // polls the future reading from the read half.
        let (pipe_writer, pipe_reader) = tokio::io::duplex(avg as usize * 10);

        let upload = chunk_and_upload(
            pipe_reader,
            self.object_store.clone(),
            self.base_path.clone(),
            avg / 2,
            avg,
            avg * 2,
        );

        Box::new(ObjectStoreBlobWriter {
            writer: Some(pipe_writer),
            fut: Some(Box::pin(upload)),
            fut_output: None,
        })
    }

    /// Looks up the list of chunks the blob with `digest` consists of.
    ///
    /// - `Ok(Some(chunks))` with the stored list if a blob index entry exists.
    /// - `Ok(Some(vec![]))` if there is no blob index entry, but a chunk with
    ///   that digest exists.
    /// - `Ok(None)` if neither exists.
    #[instrument(skip_all, err, fields(blob.digest=%digest, instance_name=%self.instance_name))]
    async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> {
        let get_result = match self
            .object_store
            .get(&derive_blob_path(&self.base_path, digest))
            .await
        {
            Ok(get_result) => get_result,
            Err(object_store::Error::NotFound { .. }) => {
                // No blob index entry. If only a chunk exists, the empty list
                // has to be returned, rather than None.
                let chunk_path = derive_chunk_path(&self.base_path, digest);
                return match self.object_store.head(&chunk_path).await {
                    Ok(_) => {
                        // present, but not split up any further
                        debug!("found a single chunk");
                        Ok(Some(vec![]))
                    }
                    Err(object_store::Error::NotFound { .. }) => {
                        // neither blob nor single chunk
                        debug!("not found");
                        Ok(None)
                    }
                    // failure while probing the chunk
                    Err(e) => Err(io::Error::from(e)),
                };
            }
            // failure while fetching the blob
            Err(err) => return Err(io::Error::from(err)),
        };

        // Read the data stored at the blob path and decode it.
        let blob_data = get_result.bytes().await?;
        let stat_blob_response: StatBlobResponse = StatBlobResponse::decode(blob_data)?;

        debug!(
            chunk.count = stat_blob_response.chunks.len(),
            blob.size = stat_blob_response
                .chunks
                .iter()
                .map(|chunk_meta| chunk_meta.size)
                .sum::<u64>(),
            "found more granular chunks"
        );

        Ok(Some(stat_blob_response.chunks))
    }
}

/// Average chunk size (in bytes) used when the configuration doesn't specify
/// one: 256 KiB.
fn default_avg_chunk_size() -> u32 {
    const KIB: u32 = 1024;
    256 * KIB
}

/// Configuration from which an [ObjectStoreBlobService] is built.
/// Unknown fields are rejected during deserialization.
#[derive(serde::Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ObjectStoreBlobServiceConfig {
    /// URL of the object store. It is parsed and handed to
    /// [object_store::parse_url_opts] when the service is built.
    object_store_url: String,
    /// Average chunk size in bytes. Falls back to `default_avg_chunk_size()`
    /// when absent from the deserialized input.
    #[serde(default = "default_avg_chunk_size")]
    avg_chunk_size: u32,
    /// Options handed to [object_store::parse_url_opts] alongside the URL.
    object_store_options: HashMap<String, String>,
}

impl TryFrom<url::Url> for ObjectStoreBlobServiceConfig {
    type Error = Box<dyn std::error::Error + Send + Sync>;
    /// Constructs a new [ObjectStoreBlobService] from a [Url] supported by
    /// [object_store].
    /// Any path suffix becomes the base path of the object store.
    /// additional options, the same as in [object_store::parse_url_opts] can
    /// be passed.
    fn try_from(url: url::Url) -> Result<Self, Self::Error> {
        // We need to convert the URL to string, strip the prefix there, and then
        // parse it back as url, as Url::set_scheme() rejects some of the transitions we want to do.
        let trimmed_url = {
            let s = url.to_string();
            let mut url = Url::parse(
                s.strip_prefix("objectstore+")
                    .ok_or(Error::StorageError("Missing objectstore uri".into()))?,
            )?;
            // trim the query pairs, they might contain credentials or local settings we don't want to send as-is.
            url.set_query(None);
            url
        };
        Ok(ObjectStoreBlobServiceConfig {
            object_store_url: trimmed_url.into(),
            object_store_options: url
                .query_pairs()
                .into_iter()
                .map(|(k, v)| (k.to_string(), v.to_string()))
                .collect(),
            avg_chunk_size: 256 * 1024,
        })
    }
}

#[async_trait]
impl ServiceBuilder for ObjectStoreBlobServiceConfig {
    type Output = dyn BlobService;
    async fn build<'a>(
        &'a self,
        instance_name: &str,
        _context: &CompositionContext,
    ) -> Result<Arc<dyn BlobService>, Box<dyn std::error::Error + Send + Sync + 'static>> {
        let (object_store, path) = object_store::parse_url_opts(
            &self.object_store_url.parse()?,
            &self.object_store_options,
        )?;
        Ok(Arc::new(ObjectStoreBlobService {
            instance_name: instance_name.to_string(),
            object_store: Arc::new(object_store),
            base_path: path,
            avg_chunk_size: self.avg_chunk_size,
        }))
    }
}

/// Reads blob contents from a AsyncRead, chunks and uploads them.
/// On success, returns a [StatBlobResponse] pointing to the individual chunks.
#[instrument(skip_all, fields(base_path=%base_path, min_chunk_size, avg_chunk_size, max_chunk_size), err)]
async fn chunk_and_upload<R: AsyncRead + Unpin>(
    r: R,
    object_store: Arc<dyn ObjectStore>,
    base_path: Path,
    min_chunk_size: u32,
    avg_chunk_size: u32,
    max_chunk_size: u32,
) -> io::Result<B3Digest> {
    // wrap reader with something calculating the blake3 hash of all data read.
    let mut b3_r = B3HashingReader::from(r);
    // set up a fastcdc chunker
    let mut chunker =
        AsyncStreamCDC::new(&mut b3_r, min_chunk_size, avg_chunk_size, max_chunk_size);

    /// This really should just belong into the closure at
    /// `chunker.as_stream().then(|_| { … })``, but if we try to, rustc spits
    /// higher-ranked lifetime errors at us.
    async fn fastcdc_chunk_uploader(
        resp: Result<fastcdc::v2020::ChunkData, fastcdc::v2020::Error>,
        base_path: Path,
        object_store: Arc<dyn ObjectStore>,
    ) -> std::io::Result<ChunkMeta> {
        let chunk_data = resp?;
        let chunk_digest: B3Digest = blake3::hash(&chunk_data.data).as_bytes().into();
        let chunk_path = derive_chunk_path(&base_path, &chunk_digest);

        upload_chunk(object_store, chunk_digest, chunk_path, chunk_data.data).await
    }

    // Use the fastcdc chunker to produce a stream of chunks, and upload these
    // that don't exist to the backend.
    let chunks = chunker
        .as_stream()
        .then(|resp| fastcdc_chunk_uploader(resp, base_path.clone(), object_store.clone()))
        .collect::<io::Result<Vec<ChunkMeta>>>()
        .await?;

    let chunks = if chunks.len() < 2 {
        // The chunker returned only one chunk, which is the entire blob.
        // According to the protocol, we must return an empty list of chunks
        // when the blob is not split up further.
        vec![]
    } else {
        chunks
    };

    let stat_blob_response = StatBlobResponse {
        chunks,
        bao: "".into(), // still todo
    };

    // check for Blob, if it doesn't exist, persist.
    let blob_digest: B3Digest = b3_r.digest().into();
    let blob_path = derive_blob_path(&base_path, &blob_digest);

    match object_store.head(&blob_path).await {
        // blob already exists, nothing to do
        Ok(_) => {
            trace!(
                blob.digest = %blob_digest,
                blob.path = %blob_path,
                "blob already exists on backend"
            );
        }
        // chunk does not yet exist, upload first
        Err(object_store::Error::NotFound { .. }) => {
            debug!(
                blob.digest = %blob_digest,
                blob.path = %blob_path,
                "uploading blob"
            );
            object_store
                .put(&blob_path, stat_blob_response.encode_to_vec().into())
                .await?;
        }
        Err(err) => {
            // other error
            Err(err)?
        }
    }

    Ok(blob_digest)
}

/// upload chunk if it doesn't exist yet.
#[instrument(skip_all, fields(chunk.digest = %chunk_digest, chunk.size = chunk_data.len(), chunk.path = %chunk_path), err)]
async fn upload_chunk(
    object_store: Arc<dyn ObjectStore>,
    chunk_digest: B3Digest,
    chunk_path: Path,
    chunk_data: Vec<u8>,
) -> std::io::Result<ChunkMeta> {
    let chunk_size = chunk_data.len();
    match object_store.head(&chunk_path).await {
        // chunk already exists, nothing to do
        Ok(_) => {
            debug!("chunk already exists");
        }

        // chunk does not yet exist, compress and upload.
        Err(object_store::Error::NotFound { .. }) => {
            let chunk_data_compressed =
                zstd::encode_all(Cursor::new(chunk_data), zstd::DEFAULT_COMPRESSION_LEVEL)?;

            debug!(chunk.compressed_size=%chunk_data_compressed.len(), "uploading chunk");

            object_store
                .as_ref()
                .put(&chunk_path, chunk_data_compressed.into())
                .await?;
        }
        // other error
        Err(err) => Err(err)?,
    }

    Ok(ChunkMeta {
        digest: chunk_digest.into(),
        size: chunk_size as u64,
    })
}

pin_project! {
    /// Takes care of blob uploads.
    /// All writes are relayed to self.writer, and we continuously poll the
    /// future (which will internally read from the other side of the pipe and
    /// upload chunks).
    /// Our BlobWriter::close() needs to drop self.writer, so the other side
    /// will read EOF and can finalize the blob.
    /// The future should then resolve and return the blob digest.
    pub struct ObjectStoreBlobWriter<W, Fut>
    where
        W: AsyncWrite,
        Fut: Future,
    {
        // The writer all incoming data is relayed to.
        // Taken out (leaving None) by BlobWriter::close().
        #[pin]
        writer: Option<W>,

        // The future doing the actual upload work. It is polled on every
        // write and flush, and awaited by BlobWriter::close().
        #[pin]
        fut: Option<Fut>,

        // The result of the upload, kept around so that repeated calls to
        // BlobWriter::close() can return it again.
        fut_output: Option<io::Result<B3Digest>>
    }
}

/// Returns the error reported by writes and flushes happening after
/// `BlobWriter::close()` took the writer out.
fn blob_writer_closed_error() -> io::Error {
    io::Error::new(io::ErrorKind::BrokenPipe, "blob writer already closed")
}

/// Produces a copy of an upload result.
/// [io::Error] is not clone-able, so for the error case we cheat a bit and
/// create a sloppy clone with the same ErrorKind and message.
fn clone_upload_result(res: &io::Result<B3Digest>) -> io::Result<B3Digest> {
    match res {
        Ok(b3_digest) => Ok(b3_digest.clone()),
        Err(e) => Err(io::Error::new(e.kind(), e.to_string())),
    }
}

/// Polls the upload future once, so the other side of the pipe makes progress.
///
/// Returns `Ok(())` while the upload is still running.
/// If the future resolves, its output is moved into `fut_output` and the
/// future is removed, as a completed future must not be polled again.
/// As the writer is only closed after all writes happened, the future can only
/// resolve at this point if the upload failed, so an error is returned, both
/// when it resolves and on every later call.
fn poll_upload<Fut>(
    mut fut: std::pin::Pin<&mut Option<Fut>>,
    fut_output: &mut Option<io::Result<B3Digest>>,
    cx: &mut std::task::Context<'_>,
) -> io::Result<()>
where
    Fut: Future<Output = io::Result<B3Digest>>,
{
    let polled = match fut.as_mut().as_pin_mut() {
        Some(upload) => upload.poll(cx),
        // The future already resolved during an earlier write or flush.
        None => return Err(io::Error::other("upload failed")),
    };

    match polled {
        Poll::Pending => Ok(()),
        Poll::Ready(output) => {
            fut.set(None);
            *fut_output = Some(output);
            Err(io::Error::other("upload failed"))
        }
    }
}

impl<W, Fut> tokio::io::AsyncWrite for ObjectStoreBlobWriter<W, Fut>
where
    W: AsyncWrite + Send + Unpin,
    Fut: Future<Output = io::Result<B3Digest>>,
{
    /// Polls the upload future, then relays the write to the underlying writer.
    /// Fails if the writer was closed already, or if the upload stopped.
    fn poll_write(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &[u8],
    ) -> std::task::Poll<Result<usize, io::Error>> {
        let this = self.project();

        let Some(writer) = this.writer.as_pin_mut() else {
            return Poll::Ready(Err(blob_writer_closed_error()));
        };

        // poll the future.
        if let Err(e) = poll_upload(this.fut, this.fut_output, cx) {
            return Poll::Ready(Err(e));
        }

        // write to the underlying writer
        writer.poll_write(cx, buf)
    }

    /// Polls the upload future, then flushes the underlying writer.
    /// Fails if the writer was closed already, or if the upload stopped.
    fn poll_flush(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
    ) -> std::task::Poll<Result<(), io::Error>> {
        let this = self.project();

        let Some(writer) = this.writer.as_pin_mut() else {
            return Poll::Ready(Err(blob_writer_closed_error()));
        };

        // poll the future.
        if let Err(e) = poll_upload(this.fut, this.fut_output, cx) {
            return Poll::Ready(Err(e));
        }

        // Call poll_flush on the writer
        writer.poll_flush(cx)
    }

    fn poll_shutdown(
        self: std::pin::Pin<&mut Self>,
        _cx: &mut std::task::Context<'_>,
    ) -> std::task::Poll<Result<(), io::Error>> {
        // There's nothing to do on shutdown. We might have written some chunks
        // that are nowhere else referenced, but cleaning them up here would be racy.
        std::task::Poll::Ready(Ok(()))
    }
}

#[async_trait]
impl<W, Fut> BlobWriter for ObjectStoreBlobWriter<W, Fut>
where
    W: AsyncWrite + Send + Unpin,
    Fut: Future<Output = io::Result<B3Digest>> + Send + Unpin,
{
    /// Finishes the upload and returns the digest of the blob written.
    ///
    /// The first call shuts down the writer, so the other side reads EOF, and
    /// then awaits the upload future. Its result is remembered, so subsequent
    /// calls return it again (errors as a copy with the same kind and message).
    /// If the upload already stopped during an earlier write or flush, the
    /// result recorded back then is returned.
    async fn close(&mut self) -> io::Result<B3Digest> {
        if let Some(mut writer) = self.writer.take() {
            // shut down the writer, so the other side will read EOF.
            // The writer is dropped afterwards, even if shutting down failed.
            writer.shutdown().await?;
        }

        // The future is still around unless it already resolved during a
        // write or flush, or an earlier call to close awaited it.
        if let Some(fut) = self.fut.take() {
            let resp = pin!(fut).await;
            // keep a copy, so future calls to close can return that.
            self.fut_output = Some(clone_upload_result(&resp));
            return resp;
        }

        match self.fut_output.as_ref() {
            Some(res) => clone_upload_result(res),
            None => Err(io::Error::other("blob writer has no upload result")),
        }
    }
}

#[cfg(test)]
mod test {
    use super::{chunk_and_upload, default_avg_chunk_size};
    use crate::{
        blobservice::{BlobService, ObjectStoreBlobService},
        fixtures::{BLOB_A, BLOB_A_DIGEST, BLOB_B, BLOB_B_DIGEST},
    };
    use std::{io::Cursor, sync::Arc};
    use url::Url;

    /// Calls chunk_and_upload directly (rather than going through the
    /// BlobWriter returned from open_write()), and inspects the result through
    /// a blob service sharing the same in-memory object store.
    #[rstest::rstest]
    #[case::a(&BLOB_A, &BLOB_A_DIGEST)]
    #[case::b(&BLOB_B, &BLOB_B_DIGEST)]
    #[tokio::test]
    async fn test_chunk_and_upload(
        #[case] blob: &bytes::Bytes,
        #[case] blob_digest: &crate::B3Digest,
    ) {
        // Chunk size limits used for this test, in bytes.
        const AVG_CHUNK_SIZE: u32 = 1024;
        const MIN_CHUNK_SIZE: u32 = AVG_CHUNK_SIZE / 2;
        const MAX_CHUNK_SIZE: u32 = AVG_CHUNK_SIZE * 2;

        let memory_url = Url::parse("memory:///").unwrap();
        let (object_store, base_path) = object_store::parse_url(&memory_url).unwrap();
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::from(object_store);

        let blobsvc = Arc::new(ObjectStoreBlobService {
            instance_name: "test".into(),
            object_store: object_store.clone(),
            avg_chunk_size: default_avg_chunk_size(),
            base_path,
        });

        let mut blob_reader = Cursor::new(blob.to_vec());
        let inserted_blob_digest = chunk_and_upload(
            &mut blob_reader,
            object_store,
            object_store::path::Path::from("/"),
            MIN_CHUNK_SIZE,
            AVG_CHUNK_SIZE,
            MAX_CHUNK_SIZE,
        )
        .await
        .expect("chunk_and_upload succeeds");

        assert_eq!(blob_digest.clone(), inserted_blob_digest);

        // The blob needs to be present now.
        assert!(blobsvc.has(blob_digest).await.unwrap());

        // Verify the blob was chunked as expected.
        let chunks = blobsvc.chunks(blob_digest).await.unwrap().unwrap();
        let blob_len = blob.len();
        if blob_len < MIN_CHUNK_SIZE as usize {
            // Smaller than the min chunk size: inserted as a whole.
            assert!(chunks.is_empty());
        } else if blob_len > MAX_CHUNK_SIZE as usize {
            // Larger than the max chunk size: split up into at least two
            // chunks.
            assert!(chunks.len() >= 2);
        }
    }
}