Skip to content

Download using xet #1305

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
May 6, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d5529b1
Download using xet
coyotte508 Mar 21, 2025
f9b2761
fix file-download-info test
coyotte508 Mar 21, 2025
7c6b0a0
safetensors
coyotte508 Mar 21, 2025
1e6c638
load etag & size from xet payload
coyotte508 Mar 21, 2025
75b51b0
Merge branch 'main' into download-xet
coyotte508 Mar 22, 2025
b0b58db
Merge remote-tracking branch 'origin/main' into download-xet
coyotte508 Mar 24, 2025
b02c4e3
fix file-download-info
coyotte508 Mar 24, 2025
c4f5e40
update download-file tests
coyotte508 Mar 24, 2025
10761d4
doc
coyotte508 Mar 24, 2025
290d086
fix mocked tests
coyotte508 Mar 24, 2025
e604d27
fix E2E
coyotte508 Mar 24, 2025
f1c81be
Merge branch 'main' into download-xet
coyotte508 Apr 2, 2025
f0c8024
merge main
coyotte508 Apr 8, 2025
6a1d819
fix test again?
coyotte508 Apr 8, 2025
a698ecf
enable browser tests again
coyotte508 Apr 8, 2025
8a7b415
download web blob with access token
coyotte508 Apr 8, 2025
01789b5
fixup! download web blob with access token
coyotte508 Apr 8, 2025
411ed2d
reorder
coyotte508 Apr 8, 2025
6ab4ad1
test to download private file
coyotte508 Apr 8, 2025
ba9990f
throw errors in WebBlob
coyotte508 Apr 8, 2025
17d17a9
fixup! throw errors in WebBlob
coyotte508 Apr 8, 2025
754550e
Merge branch 'main' into download-xet
coyotte508 Apr 29, 2025
bbd15d1
remove 10s timeout
coyotte508 Apr 29, 2025
12ca93d
remove more timeouts
coyotte508 Apr 29, 2025
60af372
Fix E2Es maybe cc @SBrandeis @hanouticelina
coyotte508 Apr 29, 2025
f8d10cf
Merge branch 'main' into download-xet
coyotte508 Apr 29, 2025
b4edda3
increase timeout for browser
coyotte508 Apr 29, 2025
cd1ae6f
debug json format for file info in browser mode
coyotte508 Apr 29, 2025
7bd33f8
Merge remote-tracking branch 'origin/main' into download-xet
coyotte508 Apr 30, 2025
3786afc
use headers now
coyotte508 Apr 30, 2025
5030616
lint
coyotte508 Apr 30, 2025
bbc3a76
remove extra dev dep
coyotte508 Apr 30, 2025
e559afe
Merge remote-tracking branch 'origin/main' into download-xet
coyotte508 May 5, 2025
524e987
Use X-Linked-Etag header in priority
coyotte508 May 5, 2025
5e0a117
Merge branch 'main' into download-xet
coyotte508 May 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions packages/hub/src/lib/commit.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ describe("commit", () => {

try {
const readme1 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL });
assert.strictEqual(readme1?.status, 200);
assert(readme1, "Readme doesn't exist");

const nodeOperation: CommitFile[] = isFrontend
? []
Expand Down Expand Up @@ -77,11 +77,9 @@ describe("commit", () => {
});

const fileContent = await downloadFile({ repo, path: "test.txt", hubUrl: TEST_HUB_URL });
assert.strictEqual(fileContent?.status, 200);
assert.strictEqual(await fileContent?.text(), "This is me");

const lfsFileContent = await downloadFile({ repo, path: "test.lfs.txt", hubUrl: TEST_HUB_URL });
assert.strictEqual(lfsFileContent?.status, 200);
assert.strictEqual(await lfsFileContent?.text(), lfsContent);

const lfsFileUrl = `${TEST_HUB_URL}/${repoName}/raw/main/test.lfs.txt`;
Expand All @@ -98,15 +96,13 @@ size ${lfsContent.length}

if (!isFrontend) {
const fileUrlContent = await downloadFile({ repo, path: "tsconfig.json", hubUrl: TEST_HUB_URL });
assert.strictEqual(fileUrlContent?.status, 200);
assert.strictEqual(
await fileUrlContent?.text(),
(await import("node:fs")).readFileSync("./tsconfig.json", "utf-8")
);
}

const webResourceContent = await downloadFile({ repo, path: "lamaral.json", hubUrl: TEST_HUB_URL });
assert.strictEqual(webResourceContent?.status, 200);
assert.strictEqual(await webResourceContent?.text(), await (await fetch(tokenizerJsonUrl)).text());

const readme2 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL });
Expand Down
15 changes: 10 additions & 5 deletions packages/hub/src/lib/download-file-to-cache-dir.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import { getHFHubCachePath, getRepoFolderName } from "./cache-management";
import { dirname, join } from "node:path";
import { writeFile, rename, lstat, mkdir, stat } from "node:fs/promises";
import { rename, lstat, mkdir, stat } from "node:fs/promises";
import type { CommitInfo, PathInfo } from "./paths-info";
import { pathsInfo } from "./paths-info";
import type { CredentialsParams, RepoDesignation } from "../types/public";
import { toRepoId } from "../utils/toRepoId";
import { downloadFile } from "./download-file";
import { createSymlink } from "../utils/symlink";
import { Readable } from "node:stream";
import type { ReadableStream } from "node:stream/web";
import { pipeline } from "node:stream/promises";
import { createWriteStream } from "node:fs";

export const REGEX_COMMIT_HASH: RegExp = new RegExp("^[0-9a-f]{40}$");

Expand Down Expand Up @@ -115,15 +119,16 @@ export async function downloadFileToCacheDir(
const incomplete = `${blobPath}.incomplete`;
console.debug(`Downloading ${params.path} to ${incomplete}`);

const response: Response | null = await downloadFile({
const blob: Blob | null = await downloadFile({
...params,
revision: commitHash,
});

if (!response || !response.ok || !response.body) throw new Error(`invalid response for file ${params.path}`);
if (!blob) {
throw new Error(`invalid response for file ${params.path}`);
}

// @ts-expect-error resp.body is a Stream, but Stream in internal to node
await writeFile(incomplete, response.body);
await pipeline(Readable.fromWeb(blob.stream() as ReadableStream), createWriteStream(incomplete));

// rename .incomplete file to expect blob
await rename(incomplete, blobPath);
Expand Down
13 changes: 13 additions & 0 deletions packages/hub/src/lib/download-file.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,17 @@ describe("downloadFile", () => {
});
}).rejects.toThrowError("Dummy internal error");
});

test("should download xet file", async () => {
const blob = await downloadFile({
repo: {
type: "model",
name: "celinah/xet-experiments",
},
path: "large_text.txt",
});

const text = await blob?.slice(0, 100).text();
expect(text).toMatch("this is a text file.".repeat(10).slice(0, 100));
});
});
53 changes: 23 additions & 30 deletions packages/hub/src/lib/download-file.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { HUB_URL } from "../consts";
import { createApiError } from "../error";
import type { CredentialsParams, RepoDesignation } from "../types/public";
import { checkCredentials } from "../utils/checkCredentials";
import { toRepoId } from "../utils/toRepoId";
import { WebBlob } from "../utils/WebBlob";
import { XetBlob } from "../utils/XetBlob";
import { fileDownloadInfo } from "./file-download-info";

/**
* @returns null when the file doesn't exist
Expand All @@ -23,43 +23,36 @@ export async function downloadFile(
* @default "main"
*/
revision?: string;
/**
* Fetch only a specific part of the file
*/
range?: [number, number];
hubUrl?: string;
/**
* Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
*/
fetch?: typeof fetch;
} & Partial<CredentialsParams>
): Promise<Response | null> {
): Promise<Blob | null> {
const accessToken = checkCredentials(params);
const repoId = toRepoId(params.repo);
const url = `${params.hubUrl ?? HUB_URL}/${repoId.type === "model" ? "" : `${repoId.type}s/`}${repoId.name}/${
params.raw ? "raw" : "resolve"
}/${encodeURIComponent(params.revision ?? "main")}/${params.path}`;

const resp = await (params.fetch ?? fetch)(url, {
headers: {
...(accessToken
? {
Authorization: `Bearer ${accessToken}`,
}
: {}),
...(params.range
? {
Range: `bytes=${params.range[0]}-${params.range[1]}`,
}
: {}),
},
const info = await fileDownloadInfo({
repo: params.repo,
path: params.path,
revision: params.revision,
hubUrl: params.hubUrl,
fetch: params.fetch,
raw: params.raw,
});

if (resp.status === 404 && resp.headers.get("X-Error-Code") === "EntryNotFound") {
if (!info) {
return null;
} else if (!resp.ok) {
throw await createApiError(resp);
}

return resp;
if (info.xet) {
return new XetBlob({
hash: info.xet.hash,
refreshUrl: info.xet.refreshUrl,
fetch: params.fetch,
accessToken,
size: info.size,
});
}

return new WebBlob(new URL(info.url), 0, info.size, "", true, params.fetch ?? fetch);
}
14 changes: 12 additions & 2 deletions packages/hub/src/lib/file-download-info.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ describe("fileDownloadInfo", () => {

assert.strictEqual(info?.size, 536063208);
assert.strictEqual(info?.etag, '"41a0e56472bad33498744818c8b1ef2c-64"');
assert(info?.downloadLink);
});

it("should fetch raw LFS pointer info", async () => {
Expand All @@ -30,7 +29,6 @@ describe("fileDownloadInfo", () => {

assert.strictEqual(info?.size, 134);
assert.strictEqual(info?.etag, '"9eb98c817f04b051b3bcca591bcd4e03cec88018"');
assert(!info?.downloadLink);
});

it("should fetch non-LFS file info", async () => {
Expand All @@ -46,4 +44,16 @@ describe("fileDownloadInfo", () => {
assert.strictEqual(info?.size, 28);
assert.strictEqual(info?.etag, '"a661b1a138dac6dc5590367402d100765010ffd6"');
});

it("should fetch xet file info", async () => {
const info = await fileDownloadInfo({
repo: {
type: "model",
name: "celinah/xet-experiments",
},
path: "large_text.txt",
});
assert.strictEqual(info?.size, 62914580);
assert.strictEqual(info?.etag, '"c27f98578d9363b27db0bc1cbd9c692f8e6e90ae98c38cee7bc0a88829debd17"');
});
});
33 changes: 28 additions & 5 deletions packages/hub/src/lib/file-download-info.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@ import type { CredentialsParams, RepoDesignation } from "../types/public";
import { checkCredentials } from "../utils/checkCredentials";
import { toRepoId } from "../utils/toRepoId";

interface XetInfo {
hash: string;
refreshUrl: string;
}

export interface FileDownloadInfoOutput {
size: number;
etag: string;
/**
* In case of LFS file, link to download directly from cloud provider
*/
downloadLink: string | null;
xet?: {
hash: string;
refreshUrl: string;
};
// URL to fetch (with the access token if private file)
url: string;
}
/**
* @returns null when the file doesn't exist
Expand Down Expand Up @@ -54,6 +61,7 @@ export async function fileDownloadInfo(
Authorization: `Bearer ${accessToken}`,
}),
Range: "bytes=0-0",
Accept: "application/vnd.xet-fileinfo+json, */*",
},
});

Expand Down Expand Up @@ -84,9 +92,24 @@ export async function fileDownloadInfo(
throw new InvalidApiResponseFormatError("Invalid file size received");
}

let xetInfo: XetInfo | undefined;
if (resp.headers.get("Content-Type") === "application/vnd.xet-fileinfo+json") {
const json: { casUrl: string; hash: string; refreshUrl: string } = await resp.json();

xetInfo = {
hash: json.hash,
refreshUrl: json.refreshUrl,
};
}

return {
etag,
size,
downloadLink: new URL(resp.url).hostname !== new URL(hubUrl).hostname ? resp.url : null,
xet: xetInfo,
// Cannot use resp.url in case it's a S3 url and the user adds an Authorization header to it.
url:
new URL(resp.url).hostname === new URL(hubUrl).hostname || resp.headers.get("X-Cache")?.endsWith(" cloudfront")
? resp.url
: url,
};
}
25 changes: 7 additions & 18 deletions packages/hub/src/lib/parse-safetensors-metadata.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,17 +89,13 @@ async function parseSingleFile(
fetch?: typeof fetch;
} & Partial<CredentialsParams>
): Promise<SafetensorsFileHeader> {
const firstResp = await downloadFile({
...params,
path,
range: [0, 7],
});
const blob = await downloadFile({ ...params, path });

if (!firstResp) {
if (!blob) {
throw new SafetensorParseError(`Failed to parse file ${path}: failed to fetch safetensors header length.`);
}

const bufLengthOfHeaderLE = await firstResp.arrayBuffer();
const bufLengthOfHeaderLE = await blob.slice(0, 8).arrayBuffer();
const lengthOfHeader = new DataView(bufLengthOfHeaderLE).getBigUint64(0, true);
// ^little-endian
if (lengthOfHeader <= 0) {
Expand All @@ -111,15 +107,9 @@ async function parseSingleFile(
);
}

const secondResp = await downloadFile({ ...params, path, range: [8, 7 + Number(lengthOfHeader)] });

if (!secondResp) {
throw new SafetensorParseError(`Failed to parse file ${path}: failed to fetch safetensors header.`);
}

try {
// no validation for now, we assume it's a valid FileHeader.
const header: SafetensorsFileHeader = await secondResp.json();
const header: SafetensorsFileHeader = JSON.parse(await blob.slice(8, 8 + Number(lengthOfHeader)).text());
return header;
} catch (err) {
throw new SafetensorParseError(`Failed to parse file ${path}: safetensors header is not valid JSON.`);
Expand All @@ -138,20 +128,19 @@ async function parseShardedIndex(
fetch?: typeof fetch;
} & Partial<CredentialsParams>
): Promise<{ index: SafetensorsIndexJson; headers: SafetensorsShardedHeaders }> {
const indexResp = await downloadFile({
const indexBlob = await downloadFile({
...params,
path,
range: [0, 10_000_000],
});

if (!indexResp) {
if (!indexBlob) {
throw new SafetensorParseError(`Failed to parse file ${path}: failed to fetch safetensors index.`);
}

// no validation for now, we assume it's a valid IndexJson.
let index: SafetensorsIndexJson;
try {
index = await indexResp.json();
index = JSON.parse(await indexBlob.slice(0, 10_000_000).text());
} catch (error) {
throw new SafetensorParseError(`Failed to parse file ${path}: not a valid JSON.`);
}
Expand Down
Loading
Loading