Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
VictorNicollet committed Jul 29, 2021
0 parents commit b799770
Show file tree
Hide file tree
Showing 30 changed files with 2,997 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
artifacts
bin
obj
_Resharper*
*.user
*.build.csdef
*.suo
.vs/
csx
Thumbs.db
test/azure_connection.txt
test/azure_dual_connection.txt
27 changes: 27 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
Copyright (c) 2020, LOKAD SAS
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

* Neither the name of LOKAD SAS nor the names of its contributors may be used to
endorse or promote products derived from this software without specific prior
written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 changes: 31 additions & 0 deletions Lokad.ContentAddr.Azure.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.29411.108
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Lokad.ContentAddr.Azure", "src\Lokad.ContentAddr.Azure.csproj", "{AC03061D-6376-4918-89AE-7B1D6661AC94}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Lokad.ContentAddr.Azure.Tests", "test\Lokad.ContentAddr.Azure.Tests.csproj", "{D348400F-7835-488E-A95E-79D1CB545327}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{AC03061D-6376-4918-89AE-7B1D6661AC94}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{AC03061D-6376-4918-89AE-7B1D6661AC94}.Debug|Any CPU.Build.0 = Debug|Any CPU
{AC03061D-6376-4918-89AE-7B1D6661AC94}.Release|Any CPU.ActiveCfg = Release|Any CPU
{AC03061D-6376-4918-89AE-7B1D6661AC94}.Release|Any CPU.Build.0 = Release|Any CPU
{D348400F-7835-488E-A95E-79D1CB545327}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D348400F-7835-488E-A95E-79D1CB545327}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D348400F-7835-488E-A95E-79D1CB545327}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D348400F-7835-488E-A95E-79D1CB545327}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {48447358-1B7D-47DD-98CB-1E8D476C31A4}
EndGlobalSection
EndGlobal
94 changes: 94 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
Azure Blob Storage support for [Lokad.ContentAddr](https://github.com/Lokad/ContentAddr).

The relevant types are `AzureStoreFactory` (for a multi-tenant store) and `AzureStore` (for
a single-tenant store). These implement the `IStoreFactory` and `IStore` interfaces from
the base Lokad.ContentAddr library.

## Data Layout

This creates two containers in the Azure Blob Storage account:

- `persist` will contain the persistent blobs, named `<realm>/<hash>` (for instance, a
blob `AC03061D6376491889AE7B1D6661AC94` in realm `85055` would be named
`85055/AC03061D6376491889AE7B1D6661AC94`).

- `staging` will contain temporary blobs as they are being uploaded to
the content-addressable store. This container can be safely emptied without
losing any committed data.

This library supports two upload models for uploading blobs:

### Uploading with the Lokad.ContentAddr library

This uses the `IStore` interface to push data to Azure Blob Storage.
For example:

```c#
AzureStore store = ...;
store.WriteAsync(new byte[] {...}, cancel);
```

More information about the implementation details (not relevant to normal use
of the library) follows:

Data is accumulated in-memory up to a certain point (around 4MB), after which
a block blob is created in the `staging` container, and the data is written
as separate blocks to that blob, to avoid consuming too much memory.

The hash of the blob is calculated on-the-fly.

Once the entire data has been provided, the hash becomes known. The library then
determines whether a blob with the same hash already exists for the tenant. If it does,
the temporary block blob is deleted. If it does not, the temporary block blob is
committed, then copied to the `persist` container with the appropriate hash.

### Uploading from a different source

It is also possible to give a different client an upload link with a shared access
signature (for instance, an in-browser JavaScript uploader). This is done
using the appropriate method on the store:

```c#
AzureStore store = ...;
string identifier = Guid.NewGuid().ToString();
TimeSpan life = TimeSpan.FromMinutes(10);
Url url = store.GetSignedUploadUrl(identifier, life);
```

This will create a Shared Access Signature that lasts 10 minutes, that allows the
client to write to that specific blob in the `staging` container.

Once the client has finished uploading, it should contact the issuer of the signed
URL in order to commit the upload to the store. The issuer then invokes:

```c#
// Same store and identifier as were used to generate the URL
AzureStore store = ...;
string identifier = ...;
await store.CommitTemporaryBlob(identifier, cancellationToken);
```

This causes the store to download the entire uploaded blob, compute its hash,
and then move it to the `persist` container (if a copy is not already present
there).

## Download links

It is possible to produce a short-lived URL that allows anyone to download a
blob from the `persist` container.

```c#
AzureStore store;
IAzureBlobRef blob = store[new Hash("AC03061D6376491889AE7B1D6661AC94")];
Uri download = await blob.GetDownloadUrlAsync(
now: DateTime.UtcNow,
life: TimeSpan.FromMinutes(10),
filename: "budget.xlsx",
contentType: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
cancel: cancellationToken);
```

The URL instructs Azure Blob Storage to return a `Content-Disposition: attachment`
header along with the blob, which causes browsers to treat the response as a file to
download (named according to the `filename` argument of the function).

Binary file added lokad.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
196 changes: 196 additions & 0 deletions src/AzureBlobRef.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
using Microsoft.WindowsAzure.Storage;
using Microsoft.WindowsAzure.Storage.Blob;
using System;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

namespace Lokad.ContentAddr.Azure
{
/// <summary> The <see cref="IReadBlobRef"/> for an azure persistent store. </summary>
/// <see cref="AzureReadOnlyStore"/>
/// <remarks>
///     Exposes the Azure Storage blob itself in <see cref="Blob"/>.
/// </remarks>
public sealed class AzureBlobRef : IAzureReadBlobRef
{
    /// <summary>
    ///     Extension of the ASCII fallback file name when the sanitized name has
    ///     no extension, or an extension that is not ASCII-only.
    /// </summary>
    private const string FallbackExtension = ".bin";

    /// <summary> Binds a realm and a hash to the Azure blob that stores the data. </summary>
    /// <param name="realm"> The blob name prefix, see <see cref="Realm"/>. </param>
    /// <param name="hash"> The hash of the blob contents. </param>
    /// <param name="blob"> The Azure Storage blob where the data is stored. </param>
    public AzureBlobRef(string realm, Hash hash, CloudBlockBlob blob)
    {
        Hash = hash;
        Realm = realm;
        Blob = blob;
    }

    /// <see cref="IReadBlobRef.Hash"/>
    public Hash Hash { get; }

    /// <summary> The blob name prefix. </summary>
    /// <remarks>
    ///     Used to avoid accidentally mixing blobs from separate customers
    ///     by using a separate prefix (the "realm") for each customer.
    /// </remarks>
    public string Realm { get; }

    /// <summary> The Azure Storage blob where the blob data is stored. </summary>
    public CloudBlockBlob Blob { get; }

    /// <see cref="IReadBlobRef.ExistsAsync"/>
    public Task<bool> ExistsAsync(CancellationToken cancel) =>
        Blob.ExistsAsync(null, null, cancel);

    /// <see cref="IAzureReadBlobRef.GetBlob"/>
    /// <remarks> The blob is already known, so the returned task is already completed. </remarks>
    public Task<CloudBlockBlob> GetBlob() =>
        Task.FromResult(Blob);

    /// <see cref="IReadBlobRef.GetSizeAsync"/>
    /// <exception cref="NoSuchBlobException">
    ///     Fetching the blob attributes failed with HTTP status 404.
    /// </exception>
    public async Task<long> GetSizeAsync(CancellationToken cancel)
    {
        // If properties are not loaded yet, they will either be null or
        // contain a length of -1. Otherwise the already-loaded length is
        // returned without any request.
        if (Blob.Properties != null && Blob.Properties.Length >= 0)
            return Blob.Properties.Length;

        try
        {
            await AzureRetry.Do(
                c => Blob.FetchAttributesAsync(null, null, null, c),
                cancel).ConfigureAwait(false);

            return Blob.Properties.Length;
        }
        catch (StorageException e) when (e.RequestInformation.HttpStatusCode == 404)
        {
            throw new NoSuchBlobException(Realm, Hash);
        }
    }

    /// <see cref="IReadBlobRef.OpenAsync"/>
    /// <remarks>
    ///     The size is resolved first (see <see cref="GetSizeAsync"/>), so this
    ///     throws <see cref="NoSuchBlobException"/> under the same conditions.
    /// </remarks>
    public async Task<Stream> OpenAsync(CancellationToken cancel)
    {
        var size = await GetSizeAsync(cancel).ConfigureAwait(false);
        return new AzureReadStream(Blob, size);
    }

    /// <see cref="IAzureReadBlobRef.GetDownloadUrlAsync"/>
    /// <param name="now"> The reference time from which validity is computed. </param>
    /// <param name="life"> How long after <paramref name="now"/> the URL remains valid. </param>
    /// <param name="filename">
    ///     The file name suggested to the browser; passed through
    ///     <see cref="SanitizeFileName"/> before being placed in the header.
    /// </param>
    /// <param name="contentType"> The Content-Type returned along with the blob. </param>
    /// <param name="cancel"> Not used: the signature is computed without any request. </param>
    /// <returns> An already-completed task with the blob URL carrying the read-only signature. </returns>
    public Task<Uri> GetDownloadUrlAsync(
        DateTime now,
        TimeSpan life,
        string filename,
        string contentType,
        CancellationToken cancel)
    {
        var (asciiFilename, utf8Filename) = SanitizeFileName(filename);

        // This Content-Disposition header will be returned by Azure along with the blob.
        // It is constructed according to RFC6266
        // https://tools.ietf.org/html/rfc6266
        //
        // If the file name is ASCII-only, we use the standard for `attachment;filename="foo.tsv"`
        // which will cause the browser to save (because `attachment`) the blob as a file
        // named `foo.tsv`.
        //
        // If the file name contains non-ASCII characters, we include the `filename=` as an
        // ASCII-only fallback for older browsers (but it will be something like `data.tsv`
        // instead of the actual file name), and the `filename*=UTF-8''f%c3%a4.tsv` for modern
        // browsers, encoding according to RFC5987
        // https://tools.ietf.org/html/rfc5987#section-3.2
        var contentDisposition = "attachment;filename=\"" + asciiFilename + "\"";
        if (utf8Filename != null)
            contentDisposition += ";filename*=UTF-8''" + utf8Filename;

        var policy = new SharedAccessBlobPolicy
        {
            Permissions = SharedAccessBlobPermissions.Read,
            SharedAccessExpiryTime = new DateTimeOffset(now + life),
            // The signature is already valid five minutes before `now`
            // (presumably to tolerate clock differences; confirm if it matters).
            SharedAccessStartTime = new DateTimeOffset(now.AddMinutes(-5))
        };

        var headers = new SharedAccessBlobHeaders
        {
            ContentDisposition = contentDisposition,
            ContentType = contentType,
            ContentEncoding = null,
        };

        var token = Blob.GetSharedAccessSignature(policy, headers);

        return Task.FromResult(new Uri(Blob.Uri, token));
    }

    /// <summary> Recognizes characters that will be replaced with a single '-'. </summary>
    private static readonly Regex BadFilenameCharacter =
        new Regex("[\\x00-\\x1F\\x7F/\\\\?%*:|*\"<>-]+", RegexOptions.Compiled);

    /// <summary> Sanitizes a file name for the content-disposition header. </summary>
    /// <remarks>
    ///     Empty file names are replaced with `"data"`.
    ///
    ///     Extension-only file names (like `".tsv"`) are prepended with `"data"`
    ///     (for example `"data.tsv"`)
    ///
    ///     Any characters that are not allowed in file names <see cref="BadFilenameCharacter"/>
    ///     are replaced with `-` and consecutive `-` are combined into one.
    ///
    ///     Any initial or final `-` or `.` are dropped.
    ///
    ///     If the string contains any non-ASCII characters, it will return a dummy `data.ext`
    ///     ASCII filename (keeping only the extension) and an UTF8 filename properly
    ///     encoded according to RFC5987 (without the preceding `UTF-8''`)
    ///
    ///     https://tools.ietf.org/html/rfc5987#section-3.2
    ///
    ///     The extension of the dummy ASCII filename is `.bin` when the file name has
    ///     no extension, or when its extension itself contains non-ASCII characters
    ///     (so that the ASCII filename is always ASCII-only). A `.gz` extension is
    ///     returned as `.csv.gz`.
    ///
    ///     If only ASCII characters are present, the returned UTF-8 filename will
    ///     be null.
    /// </remarks>
    /// <param name="filename"> The requested file name; may be null or empty. </param>
    /// <returns>
    ///     The ASCII file name, and the RFC5987-encoded file name (or null if
    ///     the ASCII file name is already the exact sanitized name).
    /// </returns>
    public static (string ascii, string utf8) SanitizeFileName(string filename)
    {
        // `"data" + null` is `"data"`, so a null file name is handled here as well.
        if (string.IsNullOrWhiteSpace(filename) || filename[0] == '.')
            filename = "data" + filename;

        filename = BadFilenameCharacter.Replace(filename, "-").Trim('-', '.');

        // Only bad characters, dashes and dots were provided.
        if (filename == "")
            return ("data", null);

        // 127 (DEL) was already replaced above, so this means "ASCII only".
        if (filename.All(c => c < 127))
            return (filename, null);

        // Found non-ASCII characters: %-encode the UTF-8 bytes.
        var utf8 = new StringBuilder();
        foreach (var b in Encoding.UTF8.GetBytes(filename))
        {
            if (IsKeptVerbatim(b))
                utf8.Append((char)b);
            else
                utf8.Append($"%{b:x2}");
        }

        // Path.GetExtension returns an empty string (not null) when there is no
        // extension, and the extension may itself contain non-ASCII characters:
        // neither can be used to build the ASCII-only fallback name.
        var ext = Path.GetExtension(filename);
        if (string.IsNullOrEmpty(ext) || ext.Any(c => c >= 127))
            ext = FallbackExtension;

        if (ext == ".gz") ext = ".csv.gz";

        return ("data" + ext, utf8.ToString());
    }

    /// <summary>
    ///     True if a byte of the UTF-8 encoded file name is written as-is in the
    ///     RFC5987 value, false if it must be %-escaped.
    /// </summary>
    /// <remarks>
    ///     Every byte accepted here is an `attr-char` of RFC5987. Bytes of
    ///     multi-byte UTF-8 sequences are all above 127 and are never accepted.
    /// </remarks>
    private static bool IsKeptVerbatim(byte b)
    {
        if (b >= 'a' && b <= 'z') return true;
        if (b >= 'A' && b <= 'Z') return true;
        if (b >= '0' && b <= '9') return true;

        switch ((char)b)
        {
            case '!':
            case '#':
            case '$':
            case '+':
            case '-':
            case '.':
            case '^':
            case '_':
            case '`':
            case '|':
            case '~':
                return true;
            default:
                return false;
        }
    }
}
}
Loading

0 comments on commit b799770

Please sign in to comment.