From d5529b1b59ba5e1147a5cbd74aa9add0ed756f11 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Fri, 21 Mar 2025 14:55:20 +0100 Subject: [PATCH 01/26] Download using xet --- packages/hub/src/lib/commit.spec.ts | 6 +- .../hub/src/lib/download-file-to-cache-dir.ts | 15 +++-- packages/hub/src/lib/download-file.spec.ts | 13 ++++ packages/hub/src/lib/download-file.ts | 49 +++++++------- packages/hub/src/lib/file-download-info.ts | 33 ++++++++-- packages/hub/src/utils/WebBlob.ts | 6 ++ packages/hub/src/utils/XetBlob.spec.ts | 66 ++++--------------- packages/hub/src/utils/XetBlob.ts | 44 ++++++------- 8 files changed, 114 insertions(+), 118 deletions(-) diff --git a/packages/hub/src/lib/commit.spec.ts b/packages/hub/src/lib/commit.spec.ts index 617be8ee89..024155bbc7 100644 --- a/packages/hub/src/lib/commit.spec.ts +++ b/packages/hub/src/lib/commit.spec.ts @@ -33,7 +33,7 @@ describe("commit", () => { try { const readme1 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL }); - assert.strictEqual(readme1?.status, 200); + assert(readme1, "Readme doesn't exist"); const nodeOperation: CommitFile[] = isFrontend ? [] @@ -77,11 +77,9 @@ describe("commit", () => { }); const fileContent = await downloadFile({ repo, path: "test.txt", hubUrl: TEST_HUB_URL }); - assert.strictEqual(fileContent?.status, 200); assert.strictEqual(await fileContent?.text(), "This is me"); const lfsFileContent = await downloadFile({ repo, path: "test.lfs.txt", hubUrl: TEST_HUB_URL }); - assert.strictEqual(lfsFileContent?.status, 200); assert.strictEqual(await lfsFileContent?.text(), lfsContent); const lfsFileUrl = `${TEST_HUB_URL}/${repoName}/raw/main/test.lfs.txt`; @@ -98,7 +96,6 @@ size ${lfsContent.length} if (!isFrontend) { const fileUrlContent = await downloadFile({ repo, path: "tsconfig.json", hubUrl: TEST_HUB_URL }); - assert.strictEqual(fileUrlContent?.status, 200); assert.strictEqual( await fileUrlContent?.text(), (await import("node:fs")).readFileSync("./tsconfig.json", "utf-8") @@ -106,7 +103,6 @@ size ${lfsContent.length} } const webResourceContent = await downloadFile({ repo, path: "lamaral.json", hubUrl: TEST_HUB_URL }); - assert.strictEqual(webResourceContent?.status, 200); assert.strictEqual(await webResourceContent?.text(), await (await fetch(tokenizerJsonUrl)).text()); const readme2 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL }); diff --git a/packages/hub/src/lib/download-file-to-cache-dir.ts b/packages/hub/src/lib/download-file-to-cache-dir.ts index 4059979a8e..ea29766453 100644 --- a/packages/hub/src/lib/download-file-to-cache-dir.ts +++ b/packages/hub/src/lib/download-file-to-cache-dir.ts @@ -1,12 +1,16 @@ import { getHFHubCachePath, getRepoFolderName } from "./cache-management"; import { dirname, join } from "node:path"; -import { writeFile, rename, lstat, mkdir, stat } from "node:fs/promises"; +import { rename, lstat, mkdir, stat } from "node:fs/promises"; import type { CommitInfo, PathInfo } from "./paths-info"; import { pathsInfo } from "./paths-info"; import type { CredentialsParams, RepoDesignation } from "../types/public"; import { toRepoId } from "../utils/toRepoId"; import { downloadFile } from "./download-file"; import { createSymlink } from "../utils/symlink"; +import { Readable } from "node:stream"; +import type { ReadableStream } from "node:stream/web"; +import { pipeline } from "node:stream/promises"; +import { createWriteStream } from "node:fs"; export const REGEX_COMMIT_HASH: RegExp = new RegExp("^[0-9a-f]{40}$"); @@ -115,15 +119,16 @@ export async function downloadFileToCacheDir( const incomplete = `${blobPath}.incomplete`; console.debug(`Downloading ${params.path} to ${incomplete}`); - const response: Response | null = await downloadFile({ + const blob: Blob | null = await downloadFile({ ...params, revision: commitHash, }); - if (!response || !response.ok || !response.body) throw new Error(`invalid response for file ${params.path}`); + if (!blob) { + throw new Error(`invalid response for file ${params.path}`); + } - // @ts-expect-error resp.body is a Stream, but Stream in internal to node - await writeFile(incomplete, response.body); + await pipeline(Readable.fromWeb(blob.stream() as ReadableStream), createWriteStream(incomplete)); // rename .incomplete file to expect blob await rename(incomplete, blobPath); diff --git a/packages/hub/src/lib/download-file.spec.ts b/packages/hub/src/lib/download-file.spec.ts index 01fc64c945..861cbac7c0 100644 --- a/packages/hub/src/lib/download-file.spec.ts +++ b/packages/hub/src/lib/download-file.spec.ts @@ -62,4 +62,17 @@ describe("downloadFile", () => { }); }).rejects.toThrowError("Dummy internal error"); }); + + test("should downoad xet file", async () => { + const blob = await downloadFile({ + repo: { + type: "model", + name: "celinah/xet-experiments", + }, + path: "large_text.txt", + }); + + const text = await blob?.slice(0, 100).text(); + expect(text).toMatch("this is a text file.".repeat(10).slice(0, 100)); + }); }); diff --git a/packages/hub/src/lib/download-file.ts b/packages/hub/src/lib/download-file.ts index 4f6ebde2e3..6b52fdb8e4 100644 --- a/packages/hub/src/lib/download-file.ts +++ b/packages/hub/src/lib/download-file.ts @@ -1,8 +1,8 @@ -import { HUB_URL } from "../consts"; -import { createApiError } from "../error"; import type { CredentialsParams, RepoDesignation } from "../types/public"; import { checkCredentials } from "../utils/checkCredentials"; -import { toRepoId } from "../utils/toRepoId"; +import { WebBlob } from "../utils/WebBlob"; +import { XetBlob } from "../utils/XetBlob"; +import { fileDownloadInfo } from "./file-download-info"; /** * @returns null when the file doesn't exist @@ -33,33 +33,30 @@ export async function downloadFile( */ fetch?: typeof fetch; } & Partial -): Promise { +): Promise { const accessToken = checkCredentials(params); - const repoId = toRepoId(params.repo); - const url = `${params.hubUrl ?? HUB_URL}/${repoId.type === "model" ? "" : `${repoId.type}s/`}${repoId.name}/${ - params.raw ? "raw" : "resolve" - }/${encodeURIComponent(params.revision ?? "main")}/${params.path}`; - - const resp = await (params.fetch ?? fetch)(url, { - headers: { - ...(accessToken - ? { - Authorization: `Bearer ${accessToken}`, - } - : {}), - ...(params.range - ? { - Range: `bytes=${params.range[0]}-${params.range[1]}`, - } - : {}), - }, + const info = await fileDownloadInfo({ + repo: params.repo, + path: params.path, + revision: params.revision, + hubUrl: params.hubUrl, + fetch: params.fetch, + raw: params.raw, }); - if (resp.status === 404 && resp.headers.get("X-Error-Code") === "EntryNotFound") { + if (!info) { return null; - } else if (!resp.ok) { - throw await createApiError(resp); } - return resp; + if (info.xet) { + return new XetBlob({ + hash: info.xet.hash, + refreshUrl: info.xet.refreshUrl, + fetch: params.fetch, + accessToken, + size: info.size, + }); + } + + return new WebBlob(new URL(info.url), 0, info.size, "", true, params.fetch ?? fetch); } diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index 3dcc79ee9e..77d24df18b 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -4,13 +4,20 @@ import type { CredentialsParams, RepoDesignation } from "../types/public"; import { checkCredentials } from "../utils/checkCredentials"; import { toRepoId } from "../utils/toRepoId"; +interface XetInfo { + hash: string; + refreshUrl: string; +} + export interface FileDownloadInfoOutput { size: number; etag: string; - /** - * In case of LFS file, link to download directly from cloud provider - */ - downloadLink: string | null; + xet?: { + hash: string; + refreshUrl: string; + }; + // URL to fetch (with the access token if private file) + url: string; } /** * @returns null when the file doesn't exist @@ -54,6 +61,7 @@ export async function fileDownloadInfo( Authorization: `Bearer ${accessToken}`, }), Range: "bytes=0-0", + Accept: "application/vnd.xet-fileinfo+json, */*", }, }); @@ -84,9 +92,24 @@ export async function fileDownloadInfo( throw new InvalidApiResponseFormatError("Invalid file size received"); } + let xetInfo: XetInfo | undefined; + if (resp.headers.get("Content-Type") === "application/vnd.xet-fileinfo+json") { + const json: { casUrl: string; hash: string; refreshUrl: string } = await resp.json(); + + xetInfo = { + hash: json.hash, + refreshUrl: json.refreshUrl, + }; + } + return { etag, size, - downloadLink: new URL(resp.url).hostname !== new URL(hubUrl).hostname ? resp.url : null, + xet: xetInfo, + // Cannot use resp.url in case it's a S3 url and the user adds an Authorization header to it. + url: + new URL(resp.url).hostname === new URL(hubUrl).hostname || resp.headers.get("X-Cache")?.endsWith(" cloudfront") + ? resp.url + : url, }; } diff --git a/packages/hub/src/utils/WebBlob.ts b/packages/hub/src/utils/WebBlob.ts index ff9aa1e0d7..29e01dfee4 100644 --- a/packages/hub/src/utils/WebBlob.ts +++ b/packages/hub/src/utils/WebBlob.ts @@ -87,6 +87,12 @@ export class WebBlob extends Blob { return result.text(); } + override async bytes(): Promise { + const result = await this.fetchRange(); + + return new Uint8Array(await result.arrayBuffer()); + } + override stream(): ReturnType { const stream = new TransformStream(); diff --git a/packages/hub/src/utils/XetBlob.spec.ts b/packages/hub/src/utils/XetBlob.spec.ts index 3539ab3473..c4e6e55b58 100644 --- a/packages/hub/src/utils/XetBlob.spec.ts +++ b/packages/hub/src/utils/XetBlob.spec.ts @@ -6,12 +6,9 @@ import { sum } from "./sum"; describe("XetBlob", () => { it("should lazy load the first 22 bytes", async () => { const blob = new XetBlob({ - repo: { - type: "model", - name: "celinah/xet-experiments", - }, hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b", size: 5_234_139_343, + refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", }); expect(await blob.slice(10, 22).text()).toBe("__metadata__"); @@ -20,10 +17,7 @@ describe("XetBlob", () => { it("should load the first chunk correctly", async () => { let xorbCount = 0; const blob = new XetBlob({ - repo: { - type: "model", - name: "celinah/xet-experiments", - }, + refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b", size: 5_234_139_343, fetch: async (url, opts) => { @@ -51,10 +45,7 @@ describe("XetBlob", () => { it("should load just past the first chunk correctly", async () => { let xorbCount = 0; const blob = new XetBlob({ - repo: { - type: "model", - name: "celinah/xet-experiments", - }, + refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b", size: 5_234_139_343, fetch: async (url, opts) => { @@ -86,10 +77,7 @@ describe("XetBlob", () => { it("should load the first 200kB correctly", async () => { let xorbCount = 0; const blob = new XetBlob({ - repo: { - type: "model", - name: "celinah/xet-experiments", - }, + refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b", size: 5_234_139_343, fetch: async (url, opts) => { @@ -118,10 +106,7 @@ describe("XetBlob", () => { it("should load correctly when loading far into a chunk range", async () => { const blob = new XetBlob({ - repo: { - type: "model", - name: "celinah/xet-experiments", - }, + refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b", size: 5_234_139_343, internalLogging: true, @@ -145,10 +130,7 @@ describe("XetBlob", () => { it("should load text correctly when offset_into_range starts in a chunk further than the first", async () => { const blob = new XetBlob({ - repo: { - type: "model", - name: "celinah/xet-experiments", - }, + refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", hash: "794efea76d8cb372bbe1385d9e51c3384555f3281e629903ecb6abeff7d54eec", size: 62_914_580, }); @@ -238,12 +220,8 @@ describe("XetBlob", () => { const blob = new XetBlob({ hash: "test", - repo: { - name: "test", - type: "model", - }, size: totalSize, - hubUrl: "https://huggingface.co", + refreshUrl: "https://huggingface.co", listener: (e) => debugged.push(e), fetch: async function (_url, opts) { const url = new URL(_url as string); @@ -345,12 +323,8 @@ describe("XetBlob", () => { const blob = new XetBlob({ hash: "test", - repo: { - name: "test", - type: "model", - }, size: totalSize, - hubUrl: "https://huggingface.co", + refreshUrl: "https://huggingface.co", listener: (e) => debugged.push(e), fetch: async function (_url, opts) { const url = new URL(_url as string); @@ -464,12 +438,8 @@ describe("XetBlob", () => { const blob = new XetBlob({ hash: "test", - repo: { - name: "test", - type: "model", - }, size: totalSize, - hubUrl: "https://huggingface.co", + refreshUrl: "https://huggingface.co", listener: (e) => debugged.push(e), fetch: async function (_url, opts) { const url = new URL(_url as string); @@ -578,12 +548,8 @@ describe("XetBlob", () => { const blob = new XetBlob({ hash: "test", - repo: { - name: "test", - type: "model", - }, size: totalSize, - hubUrl: "https://huggingface.co", + refreshUrl: "https://huggingface.co", listener: (e) => debugged.push(e), fetch: async function (_url, opts) { const url = new URL(_url as string); @@ -690,12 +656,8 @@ describe("XetBlob", () => { const blob = new XetBlob({ hash: "test", - repo: { - name: "test", - type: "model", - }, size: totalSize, - hubUrl: "https://huggingface.co", + refreshUrl: "https://huggingface.co", listener: (e) => debugged.push(e), fetch: async function (_url, opts) { const url = new URL(_url as string); @@ -801,12 +763,8 @@ describe("XetBlob", () => { const blob = new XetBlob({ hash: "test", - repo: { - name: "test", - type: "model", - }, size: totalSize, - hubUrl: "https://huggingface.co", + refreshUrl: "https://huggingface.co", listener: (e) => debugged.push(e), fetch: async function (_url, opts) { const url = new URL(_url as string); diff --git a/packages/hub/src/utils/XetBlob.ts b/packages/hub/src/utils/XetBlob.ts index ca91e5cbab..2a3bbcc883 100644 --- a/packages/hub/src/utils/XetBlob.ts +++ b/packages/hub/src/utils/XetBlob.ts @@ -1,8 +1,6 @@ -import { HUB_URL } from "../consts"; import { createApiError } from "../error"; -import type { CredentialsParams, RepoDesignation, RepoId } from "../types/public"; +import type { CredentialsParams } from "../types/public"; import { checkCredentials } from "./checkCredentials"; -import { toRepoId } from "./toRepoId"; import { decompress as lz4_decompress } from "../vendor/lz4js"; import { RangeList } from "./RangeList"; @@ -14,9 +12,9 @@ type XetBlobCreateOptions = { * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers. */ fetch?: typeof fetch; - repo: RepoDesignation; hash: string; - hubUrl?: string; + // URL to get the access token from + refreshUrl: string; size: number; listener?: (arg: { event: "read" } | { event: "progress"; progress: { read: number; total: number } }) => void; internalLogging?: boolean; @@ -85,8 +83,7 @@ const CHUNK_HEADER_BYTES = 8; export class XetBlob extends Blob { fetch: typeof fetch; accessToken?: string; - repoId: RepoId; - hubUrl: string; + refreshUrl: string; hash: string; start = 0; end = 0; @@ -99,13 +96,12 @@ export class XetBlob extends Blob { this.fetch = params.fetch ?? fetch.bind(globalThis); this.accessToken = checkCredentials(params); - this.repoId = toRepoId(params.repo); - this.hubUrl = params.hubUrl ?? HUB_URL; + this.refreshUrl = params.refreshUrl; this.end = params.size; this.hash = params.hash; this.listener = params.listener; this.internalLogging = params.internalLogging ?? false; - this.hubUrl; + this.refreshUrl; } override get size(): number { @@ -115,9 +111,8 @@ export class XetBlob extends Blob { #clone() { const blob = new XetBlob({ fetch: this.fetch, - repo: this.repoId, hash: this.hash, - hubUrl: this.hubUrl, + refreshUrl: this.refreshUrl, size: this.size, }); @@ -156,7 +151,7 @@ export class XetBlob extends Blob { } this.#reconstructionInfoPromise = (async () => { - const connParams = await getAccessToken(this.repoId, this.accessToken, this.fetch, this.hubUrl); + const connParams = await getAccessToken(this.accessToken, this.fetch, this.refreshUrl); // debug( // `curl '${connParams.casUrl}/reconstruction/${this.hash}' -H 'Authorization: Bearer ${connParams.accessToken}'` @@ -489,6 +484,12 @@ export class XetBlob extends Blob { return new Response(result).arrayBuffer(); } + override async bytes(): Promise { + const result = await this.#fetch(); + + return new Uint8Array(await new Response(result).arrayBuffer()); + } + override async text(): Promise { const result = await this.#fetch(); @@ -525,8 +526,8 @@ const jwts: Map< } > = new Map(); -function cacheKey(params: { repoId: RepoId; initialAccessToken: string | undefined }): string { - return `${params.repoId.type}:${params.repoId.name}:${params.initialAccessToken}`; +function cacheKey(params: { refreshUrl: string; initialAccessToken: string | undefined }): string { + return JSON.stringify([params.refreshUrl, params.initialAccessToken]); } // exported for testing purposes @@ -592,12 +593,11 @@ export function bg4_regoup_bytes(bytes: Uint8Array): Uint8Array { } async function getAccessToken( - repoId: RepoId, initialAccessToken: string | undefined, customFetch: typeof fetch, - hubUrl: string + refreshUrl: string ): Promise<{ accessToken: string; casUrl: string }> { - const key = cacheKey({ repoId, initialAccessToken }); + const key = cacheKey({ refreshUrl, initialAccessToken }); const jwt = jwts.get(key); @@ -612,8 +612,7 @@ async function getAccessToken( } const promise = (async () => { - const url = `${hubUrl}/api/${repoId.type}s/${repoId.name}/xet-read-token/main`; - const resp = await customFetch(url, { + const resp = await customFetch(refreshUrl, { headers: { ...(initialAccessToken ? { @@ -629,11 +628,10 @@ async function getAccessToken( const json: { accessToken: string; casUrl: string; exp: number } = await resp.json(); const jwt = { - repoId, accessToken: json.accessToken, expiresAt: new Date(json.exp * 1000), initialAccessToken, - hubUrl, + refreshUrl, casUrl: json.casUrl, }; @@ -660,7 +658,7 @@ async function getAccessToken( }; })(); - jwtPromises.set(repoId.name, promise); + jwtPromises.set(key, promise); return promise; } From f9b2761c4bf82138fe103a091ca081da2c51e552 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Fri, 21 Mar 2025 15:02:09 +0100 Subject: [PATCH 02/26] fix file-download-info test --- packages/hub/src/lib/file-download-info.spec.ts | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/packages/hub/src/lib/file-download-info.spec.ts b/packages/hub/src/lib/file-download-info.spec.ts index bea4d6b716..7bdbf2f23a 100644 --- a/packages/hub/src/lib/file-download-info.spec.ts +++ b/packages/hub/src/lib/file-download-info.spec.ts @@ -14,7 +14,6 @@ describe("fileDownloadInfo", () => { assert.strictEqual(info?.size, 536063208); assert.strictEqual(info?.etag, '"41a0e56472bad33498744818c8b1ef2c-64"'); - assert(info?.downloadLink); }); it("should fetch raw LFS pointer info", async () => { @@ -30,7 +29,6 @@ describe("fileDownloadInfo", () => { assert.strictEqual(info?.size, 134); assert.strictEqual(info?.etag, '"9eb98c817f04b051b3bcca591bcd4e03cec88018"'); - assert(!info?.downloadLink); }); it("should fetch non-LFS file info", async () => { @@ -46,4 +44,16 @@ describe("fileDownloadInfo", () => { assert.strictEqual(info?.size, 28); assert.strictEqual(info?.etag, '"a661b1a138dac6dc5590367402d100765010ffd6"'); }); + + it("should fetch xet file info", async () => { + const info = await fileDownloadInfo({ + repo: { + type: "model", + name: "celinah/xet-experiments", + }, + path: "large_text.txt", + }); + assert.strictEqual(info?.size, 62914580); + assert.strictEqual(info?.etag, '" c27f98578d9363b27db0bc1cbd9c692f8e6e90ae98c38cee7bc0a88829debd17"'); + }); }); From 7c6b0a06daabf073c0004333bc926579a484bdd8 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Fri, 21 Mar 2025 15:07:47 +0100 Subject: [PATCH 03/26] safetensors --- packages/hub/src/lib/download-file.ts | 4 --- .../hub/src/lib/parse-safetensors-metadata.ts | 25 ++++++------------- packages/hub/src/utils/WebBlob.ts | 6 ----- packages/hub/src/utils/XetBlob.ts | 6 ----- 4 files changed, 7 insertions(+), 34 deletions(-) diff --git a/packages/hub/src/lib/download-file.ts b/packages/hub/src/lib/download-file.ts index 6b52fdb8e4..b86984c97a 100644 --- a/packages/hub/src/lib/download-file.ts +++ b/packages/hub/src/lib/download-file.ts @@ -23,10 +23,6 @@ export async function downloadFile( * @default "main" */ revision?: string; - /** - * Fetch only a specific part of the file - */ - range?: [number, number]; hubUrl?: string; /** * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers. diff --git a/packages/hub/src/lib/parse-safetensors-metadata.ts b/packages/hub/src/lib/parse-safetensors-metadata.ts index 063a503c9f..ca43a00883 100644 --- a/packages/hub/src/lib/parse-safetensors-metadata.ts +++ b/packages/hub/src/lib/parse-safetensors-metadata.ts @@ -89,17 +89,13 @@ async function parseSingleFile( fetch?: typeof fetch; } & Partial ): Promise { - const firstResp = await downloadFile({ - ...params, - path, - range: [0, 7], - }); + const blob = await downloadFile({ ...params, path }); - if (!firstResp) { + if (!blob) { throw new SafetensorParseError(`Failed to parse file ${path}: failed to fetch safetensors header length.`); } - const bufLengthOfHeaderLE = await firstResp.arrayBuffer(); + const bufLengthOfHeaderLE = await blob.slice(0, 8).arrayBuffer(); const lengthOfHeader = new DataView(bufLengthOfHeaderLE).getBigUint64(0, true); // ^little-endian if (lengthOfHeader <= 0) { @@ -111,15 +107,9 @@ async function parseSingleFile( ); } - const secondResp = await downloadFile({ ...params, path, range: [8, 7 + Number(lengthOfHeader)] }); - - if (!secondResp) { - throw new SafetensorParseError(`Failed to parse file ${path}: failed to fetch safetensors header.`); - } - try { // no validation for now, we assume it's a valid FileHeader. - const header: SafetensorsFileHeader = await secondResp.json(); + const header: SafetensorsFileHeader = JSON.parse(await blob.slice(8, 8 + Number(lengthOfHeader)).text()); return header; } catch (err) { throw new SafetensorParseError(`Failed to parse file ${path}: safetensors header is not valid JSON.`); @@ -138,20 +128,19 @@ async function parseShardedIndex( fetch?: typeof fetch; } & Partial ): Promise<{ index: SafetensorsIndexJson; headers: SafetensorsShardedHeaders }> { - const indexResp = await downloadFile({ + const indexBlob = await downloadFile({ ...params, path, - range: [0, 10_000_000], }); - if (!indexResp) { + if (!indexBlob) { throw new SafetensorParseError(`Failed to parse file ${path}: failed to fetch safetensors index.`); } // no validation for now, we assume it's a valid IndexJson. let index: SafetensorsIndexJson; try { - index = await indexResp.json(); + index = JSON.parse(await indexBlob.slice(0, 10_000_000).text()); } catch (error) { throw new SafetensorParseError(`Failed to parse file ${path}: not a valid JSON.`); } diff --git a/packages/hub/src/utils/WebBlob.ts b/packages/hub/src/utils/WebBlob.ts index 29e01dfee4..ff9aa1e0d7 100644 --- a/packages/hub/src/utils/WebBlob.ts +++ b/packages/hub/src/utils/WebBlob.ts @@ -87,12 +87,6 @@ export class WebBlob extends Blob { return result.text(); } - override async bytes(): Promise { - const result = await this.fetchRange(); - - return new Uint8Array(await result.arrayBuffer()); - } - override stream(): ReturnType { const stream = new TransformStream(); diff --git a/packages/hub/src/utils/XetBlob.ts b/packages/hub/src/utils/XetBlob.ts index 2a3bbcc883..aed5852e73 100644 --- a/packages/hub/src/utils/XetBlob.ts +++ b/packages/hub/src/utils/XetBlob.ts @@ -484,12 +484,6 @@ export class XetBlob extends Blob { return new Response(result).arrayBuffer(); } - override async bytes(): Promise { - const result = await this.#fetch(); - - return new Uint8Array(await new Response(result).arrayBuffer()); - } - override async text(): Promise { const result = await this.#fetch(); From 1e6c638b27056c448f9fb1828fd5020c6f4e9c76 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Fri, 21 Mar 2025 15:34:08 +0100 Subject: [PATCH 04/26] load etag & size from xet payload --- packages/hub/src/lib/file-download-info.ts | 43 +++++++++++++--------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index 77d24df18b..5a8a02ef46 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -73,33 +73,40 @@ export async function fileDownloadInfo( throw await createApiError(resp); } - const etag = resp.headers.get("ETag"); + let etag: string | undefined; + let size: number | undefined; + let xetInfo: XetInfo | undefined; + if (resp.headers.get("Content-Type") === "application/vnd.xet-fileinfo+json") { + const json: { casUrl: string; hash: string; refreshUrl: string; size: string; etag: string } = await resp.json(); - if (!etag) { - throw new InvalidApiResponseFormatError("Expected ETag"); + xetInfo = { + hash: json.hash, + refreshUrl: json.refreshUrl, + }; + + etag = json.etag; + size = parseInt(json.size); } - const contentRangeHeader = resp.headers.get("content-range"); + etag ??= resp.headers.get("ETag") ?? undefined; - if (!contentRangeHeader) { - throw new InvalidApiResponseFormatError("Expected size information"); + if (!etag) { + throw new InvalidApiResponseFormatError("Expected ETag"); } - const [, parsedSize] = contentRangeHeader.split("/"); - const size = parseInt(parsedSize); + if (size === undefined || isNaN(size)) { + const contentRangeHeader = resp.headers.get("content-range"); - if (isNaN(size)) { - throw new InvalidApiResponseFormatError("Invalid file size received"); - } + if (!contentRangeHeader) { + throw new InvalidApiResponseFormatError("Expected size information"); + } - let xetInfo: XetInfo | undefined; - if (resp.headers.get("Content-Type") === "application/vnd.xet-fileinfo+json") { - const json: { casUrl: string; hash: string; refreshUrl: string } = await resp.json(); + const [, parsedSize] = contentRangeHeader.split("/"); + size = parseInt(parsedSize); - xetInfo = { - hash: json.hash, - refreshUrl: json.refreshUrl, - }; + if (isNaN(size)) { + throw new InvalidApiResponseFormatError("Invalid file size received"); + } } return { From b02c4e32214509ddbae38435bf506d03b123d10c Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Mon, 24 Mar 2025 21:15:53 +0100 Subject: [PATCH 05/26] fix file-download-info --- packages/hub/src/lib/file-download-info.spec.ts | 2 +- packages/hub/src/lib/file-download-info.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/hub/src/lib/file-download-info.spec.ts b/packages/hub/src/lib/file-download-info.spec.ts index 7bdbf2f23a..51a0245413 100644 --- a/packages/hub/src/lib/file-download-info.spec.ts +++ b/packages/hub/src/lib/file-download-info.spec.ts @@ -54,6 +54,6 @@ describe("fileDownloadInfo", () => { path: "large_text.txt", }); assert.strictEqual(info?.size, 62914580); - assert.strictEqual(info?.etag, '" c27f98578d9363b27db0bc1cbd9c692f8e6e90ae98c38cee7bc0a88829debd17"'); + assert.strictEqual(info?.etag, '"c27f98578d9363b27db0bc1cbd9c692f8e6e90ae98c38cee7bc0a88829debd17"'); }); }); diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index 5a8a02ef46..facd96aa8a 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -76,7 +76,7 @@ export async function fileDownloadInfo( let etag: string | undefined; let size: number | undefined; let xetInfo: XetInfo | undefined; - if (resp.headers.get("Content-Type") === "application/vnd.xet-fileinfo+json") { + if (resp.headers.get("Content-Type")?.includes("application/vnd.xet-fileinfo+json")) { const json: { casUrl: string; hash: string; refreshUrl: string; size: string; etag: string } = await resp.json(); xetInfo = { From c4f5e408dcce5f15c8ae77aea1716d616c20780c Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Mon, 24 Mar 2025 21:29:16 +0100 Subject: [PATCH 06/26] update download-file tests --- packages/hub/src/lib/download-file.spec.ts | 74 ++++++---------------- packages/hub/src/lib/download-file.ts | 2 +- packages/hub/src/lib/file-download-info.ts | 9 +-- 3 files changed, 23 insertions(+), 62 deletions(-) diff --git a/packages/hub/src/lib/download-file.spec.ts b/packages/hub/src/lib/download-file.spec.ts index 861cbac7c0..6f92b23d4c 100644 --- a/packages/hub/src/lib/download-file.spec.ts +++ b/packages/hub/src/lib/download-file.spec.ts @@ -1,68 +1,32 @@ -import { expect, test, describe, vi } from "vitest"; +import { expect, test, describe, assert } from "vitest"; import { downloadFile } from "./download-file"; -import type { RepoId } from "../types/public"; - -const DUMMY_REPO: RepoId = { - name: "hello-world", - type: "model", -}; describe("downloadFile", () => { - test("hubUrl params should overwrite HUB_URL", async () => { - const fetchMock: typeof fetch = vi.fn(); - vi.mocked(fetchMock).mockResolvedValue({ - status: 200, - ok: true, - } as Response); - - await downloadFile({ - repo: DUMMY_REPO, - path: "/README.md", - hubUrl: "http://dummy-hub", - fetch: fetchMock, + test("should download regular file", async () => { + const blob = await downloadFile({ + repo: { + type: "model", + name: "openai-community/gpt2", + }, + path: "README.md", }); - expect(fetchMock).toHaveBeenCalledWith("http://dummy-hub/hello-world/resolve/main//README.md", expect.anything()); - }); - - test("raw params should use raw url", async () => { - const fetchMock: typeof fetch = vi.fn(); - vi.mocked(fetchMock).mockResolvedValue({ - status: 200, - ok: true, - } as Response); + const text = await blob?.slice(0, 1000).text(); + assert( + text?.includes(`--- +language: en +tags: +- exbert - await downloadFile({ - repo: DUMMY_REPO, - path: "README.md", - raw: true, - fetch: fetchMock, - }); +license: mit +--- - expect(fetchMock).toHaveBeenCalledWith("https://huggingface.co/hello-world/raw/main/README.md", expect.anything()); - }); - test("internal server error should propagate the error", async () => { - const fetchMock: typeof fetch = vi.fn(); - vi.mocked(fetchMock).mockResolvedValue({ - status: 500, - ok: false, - headers: new Map([["Content-Type", "application/json"]]), - json: () => ({ - error: "Dummy internal error", - }), - } as unknown as Response); +# GPT-2 - await expect(async () => { - await downloadFile({ - repo: DUMMY_REPO, - path: "README.md", - raw: true, - fetch: fetchMock, - }); - }).rejects.toThrowError("Dummy internal error"); +Test the whole generation capabilities here: https://transformer.huggingface.co/doc/gpt2-large`) + ); }); - test("should downoad xet file", async () => { const blob = await downloadFile({ repo: { diff --git a/packages/hub/src/lib/download-file.ts b/packages/hub/src/lib/download-file.ts index b86984c97a..98f6cc9f04 100644 --- a/packages/hub/src/lib/download-file.ts +++ b/packages/hub/src/lib/download-file.ts @@ -47,7 +47,7 @@ export async function downloadFile( if (info.xet) { return new XetBlob({ hash: info.xet.hash, - refreshUrl: info.xet.refreshUrl, + refreshUrl: info.xet.refreshUrl.href, fetch: params.fetch, accessToken, size: info.size, diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index facd96aa8a..f6760dfb52 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -6,16 +6,13 @@ import { toRepoId } from "../utils/toRepoId"; interface XetInfo { hash: string; - refreshUrl: string; + refreshUrl: URL; } export interface FileDownloadInfoOutput { size: number; etag: string; - xet?: { - hash: string; - refreshUrl: string; - }; + xet?: XetInfo; // URL to fetch (with the access token if private file) url: string; } @@ -81,7 +78,7 @@ export async function fileDownloadInfo( xetInfo = { hash: json.hash, - refreshUrl: json.refreshUrl, + refreshUrl: new URL(json.refreshUrl, hubUrl), }; etag = json.etag; From 10761d455444e0c2e5ac9747893bd552f0eb02f2 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Mon, 24 Mar 2025 22:35:58 +0100 Subject: [PATCH 07/26] doc --- CONTRIBUTING.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 95d353300c..299772d208 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,9 +20,7 @@ It's not a hard requirement, but please consider using an icon from [Gitmoji](ht If you want to run only specific tests, you can do `pnpm test -- -t "test name"`. -You can also do `npx vitest ./packages/hub/src/utils/XetBlob.spec.ts` to run a specific test file. - -Or `cd packages/hub && npx vitest --browser.name=chrome --browser.headless --config vitest-browser.config.mts ./src/utils/XetBlob.spec.ts` to run browser tests on a specific file +You can also do `pnpm --filter hub test ./src/utils/XetBlob.spec.ts` to run a specific test file. ## Adding a package From 290d08644210c761f99db635dfffb1684936569a Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Mon, 24 Mar 2025 22:58:18 +0100 Subject: [PATCH 08/26] fix mocked tests --- .../lib/download-file-to-cache-dir.spec.ts | 31 +++++++++++++------ packages/hub/src/lib/file-download-info.ts | 3 +- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/packages/hub/src/lib/download-file-to-cache-dir.spec.ts b/packages/hub/src/lib/download-file-to-cache-dir.spec.ts index 29d17f2870..fb407c4c2e 100644 --- a/packages/hub/src/lib/download-file-to-cache-dir.spec.ts +++ b/packages/hub/src/lib/download-file-to-cache-dir.spec.ts @@ -1,16 +1,15 @@ import { expect, test, describe, vi, beforeEach } from "vitest"; import type { RepoDesignation, RepoId } from "../types/public"; import { dirname, join } from "node:path"; -import { lstat, mkdir, stat, symlink, writeFile, rename } from "node:fs/promises"; +import { lstat, mkdir, stat, symlink, rename } from "node:fs/promises"; import { pathsInfo } from "./paths-info"; -import type { Stats } from "node:fs"; +import { createWriteStream, type Stats } from "node:fs"; import { getHFHubCachePath, getRepoFolderName } from "./cache-management"; import { toRepoId } from "../utils/toRepoId"; import { downloadFileToCacheDir } from "./download-file-to-cache-dir"; import { createSymlink } from "../utils/symlink"; vi.mock("node:fs/promises", () => ({ - writeFile: vi.fn(), rename: vi.fn(), symlink: vi.fn(), lstat: vi.fn(), @@ -18,6 +17,10 @@ vi.mock("node:fs/promises", () => ({ stat: vi.fn(), })); +vi.mock("node:fs", () => ({ + createWriteStream: vi.fn(), +})); + vi.mock("./paths-info", () => ({ pathsInfo: vi.fn(), })); @@ -63,11 +66,15 @@ describe("downloadFileToCacheDir", () => { beforeEach(() => { vi.resetAllMocks(); // mock 200 request - vi.mocked(fetchMock).mockResolvedValue({ - status: 200, - ok: true, - body: "dummy-body", - } as unknown as Response); + vi.mocked(fetchMock).mockResolvedValue( + new Response("dummy-body", { + status: 200, + headers: { + etag: DUMMY_ETAG, + "Content-Range": "bytes 0-54/55", + }, + }) + ); // prevent to use caching vi.mocked(stat).mockRejectedValue(new Error("Do not exists")); @@ -235,6 +242,9 @@ describe("downloadFileToCacheDir", () => { }, ]); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + vi.mocked(createWriteStream).mockReturnValue(async function* () {} as any); + const output = await downloadFileToCacheDir({ repo: DUMMY_REPO, path: "/README.md", @@ -276,6 +286,9 @@ describe("downloadFileToCacheDir", () => { }, ]); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + vi.mocked(createWriteStream).mockReturnValue(async function* () {} as any); + await downloadFileToCacheDir({ repo: DUMMY_REPO, path: "/README.md", @@ -284,7 +297,7 @@ describe("downloadFileToCacheDir", () => { const incomplete = `${expectedBlob}.incomplete`; // 1. should write fetch#response#body to incomplete file - expect(writeFile).toHaveBeenCalledWith(incomplete, "dummy-body"); + expect(createWriteStream).toHaveBeenCalledWith(incomplete); // 2. should rename the incomplete to the blob expected name expect(rename).toHaveBeenCalledWith(incomplete, expectedBlob); // 3. should create symlink pointing to blob diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index f6760dfb52..98e28ddca8 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -112,7 +112,8 @@ export async function fileDownloadInfo( xet: xetInfo, // Cannot use resp.url in case it's a S3 url and the user adds an Authorization header to it. url: - new URL(resp.url).hostname === new URL(hubUrl).hostname || resp.headers.get("X-Cache")?.endsWith(" cloudfront") + resp.url && + (new URL(resp.url).hostname === new URL(hubUrl).hostname || resp.headers.get("X-Cache")?.endsWith(" cloudfront")) ? resp.url : url, }; From e604d2790c121c912602ece0386f7eb7377edc41 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Mon, 24 Mar 2025 23:01:16 +0100 Subject: [PATCH 09/26] fix E2E --- packages/jinja/test/e2e.test.js | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/packages/jinja/test/e2e.test.js b/packages/jinja/test/e2e.test.js index 3ce98e2342..1ec3d0f43f 100644 --- a/packages/jinja/test/e2e.test.js +++ b/packages/jinja/test/e2e.test.js @@ -716,12 +716,11 @@ describe("End-to-end tests", () => { it("should parse a chat template from the Hugging Face Hub", async () => { const repo = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"; - const tokenizerConfig = await ( - await downloadFile({ - repo, - path: "tokenizer_config.json", - }) - ).json(); + const blob = await downloadFile({ + repo, + path: "tokenizer_config.json", + }); + const tokenizerConfig = JSON.parse(await blob.text()); const template = new Template(tokenizerConfig.chat_template); const result = template.render(TEST_CUSTOM_TEMPLATES[repo].data); From 6a1d8193d29d2191f967d66a80da9344c17d1ac8 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 8 Apr 2025 14:08:20 +0200 Subject: [PATCH 10/26] fix test again? --- packages/hub/src/lib/file-download-info.spec.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/hub/src/lib/file-download-info.spec.ts b/packages/hub/src/lib/file-download-info.spec.ts index 37524c0468..d2be156626 100644 --- a/packages/hub/src/lib/file-download-info.spec.ts +++ b/packages/hub/src/lib/file-download-info.spec.ts @@ -13,7 +13,7 @@ describe("fileDownloadInfo", () => { }); assert.strictEqual(info?.size, 536063208); - assert.strictEqual(info?.etag, '"879c5715c18a0b7f051dd33f70f0a5c8dd1522e0a43f6f75520f16167f29279b"'); + assert.strictEqual(info?.etag, '"a7a17d6d844b5de815ccab5f42cad6d24496db3850a2a43d8258221018ce87d2"'); }); it("should fetch raw LFS pointer info", async () => { From a698ecf22c81dd4affcd2d2f9e5e648fb4ab7af1 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 8 Apr 2025 14:09:20 +0200 Subject: [PATCH 11/26] enable browser tests again --- packages/hub/src/utils/XetBlob.spec.ts | 100 ++++++++++++------------- 1 file changed, 48 insertions(+), 52 deletions(-) diff --git a/packages/hub/src/utils/XetBlob.spec.ts b/packages/hub/src/utils/XetBlob.spec.ts index be90f9d058..e3233fab6d 100644 --- a/packages/hub/src/utils/XetBlob.spec.ts +++ b/packages/hub/src/utils/XetBlob.spec.ts @@ -71,62 +71,58 @@ describe("XetBlob", () => { expect(xorbCount).toBe(2); }); - // Doesn't work in chrome due to caching issues, it caches the partial output when the - // fetch is interrupted in the previous test and then uses that cached output in this test (that requires more data) - if (typeof window === "undefined") { - it("should load the first 200kB correctly", async () => { - let xorbCount = 0; - const blob = new XetBlob({ - refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", - hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b", - size: 5_234_139_343, - fetch: async (url, opts) => { - if (typeof url === "string" && url.includes("/xorbs/")) { - xorbCount++; - } - return fetch(url, opts); - }, - // internalLogging: true, - }); - - const xetDownload = await blob.slice(0, 200_000).arrayBuffer(); - const bridgeDownload = await fetch( - "https://huggingface.co/celinah/xet-experiments/resolve/main/model5GB.safetensors", - { - headers: { - Range: "bytes=0-199999", - }, + it("should load the first 200kB correctly", async () => { + let xorbCount = 0; + const blob = new XetBlob({ + refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", + hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b", + size: 5_234_139_343, + fetch: async (url, opts) => { + if (typeof url === "string" && url.includes("/xorbs/")) { + xorbCount++; } - ).then((res) => res.arrayBuffer()); - - expect(xetDownload.byteLength).toBe(200_000); - expect(new Uint8Array(xetDownload)).toEqual(new Uint8Array(bridgeDownload)); - expect(xorbCount).toBe(2); - }, 60_000); - - it("should load correctly when loading far into a chunk range", async () => { - const blob = new XetBlob({ - refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", - hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b", - size: 5_234_139_343, - // internalLogging: true, - }); + return fetch(url, opts); + }, + // internalLogging: true, + }); - const xetDownload = await blob.slice(10_000_000, 10_100_000).arrayBuffer(); - const bridgeDownload = await fetch( - "https://huggingface.co/celinah/xet-experiments/resolve/main/model5GB.safetensors", - { - headers: { - Range: "bytes=10000000-10099999", - }, - } - ).then((res) => res.arrayBuffer()); + const xetDownload = await blob.slice(0, 200_000).arrayBuffer(); + const bridgeDownload = await fetch( + "https://huggingface.co/celinah/xet-experiments/resolve/main/model5GB.safetensors", + { + headers: { + Range: "bytes=0-199999", + }, + } + ).then((res) => res.arrayBuffer()); + + expect(xetDownload.byteLength).toBe(200_000); + expect(new Uint8Array(xetDownload)).toEqual(new Uint8Array(bridgeDownload)); + expect(xorbCount).toBe(2); + }, 60_000); - console.log("xet", xetDownload.byteLength, "bridge", bridgeDownload.byteLength); - expect(new Uint8Array(xetDownload).length).toEqual(100_000); - expect(new Uint8Array(xetDownload)).toEqual(new Uint8Array(bridgeDownload)); + it("should load correctly when loading far into a chunk range", async () => { + const blob = new XetBlob({ + refreshUrl: "https://huggingface.co/api/models/celinah/xet-experiments/xet-read-token/main", + hash: "7b3b6d07673a88cf467e67c1f7edef1a8c268cbf66e9dd9b0366322d4ab56d9b", + size: 5_234_139_343, + // internalLogging: true, }); - } + + const xetDownload = await blob.slice(10_000_000, 10_100_000).arrayBuffer(); + const bridgeDownload = await fetch( + "https://huggingface.co/celinah/xet-experiments/resolve/main/model5GB.safetensors", + { + headers: { + Range: "bytes=10000000-10099999", + }, + } + ).then((res) => res.arrayBuffer()); + + console.log("xet", xetDownload.byteLength, "bridge", bridgeDownload.byteLength); + expect(new Uint8Array(xetDownload).length).toEqual(100_000); + expect(new Uint8Array(xetDownload)).toEqual(new Uint8Array(bridgeDownload)); + }); it("should load text correctly when offset_into_range starts in a chunk further than the first", async () => { const blob = new XetBlob({ From 8a7b41517160db53d11e0c21f4d5e1e55f658312 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 8 Apr 2025 14:25:09 +0200 Subject: [PATCH 12/26] download web blob with access token --- packages/hub/src/lib/download-file.ts | 39 +++++++++++++++------ packages/hub/src/lib/file-download-info.ts | 40 ++++++++++++---------- packages/hub/src/utils/WebBlob.spec.ts | 6 ++-- packages/hub/src/utils/WebBlob.ts | 30 +++++++++++++--- 4 files changed, 79 insertions(+), 36 deletions(-) diff --git a/packages/hub/src/lib/download-file.ts b/packages/hub/src/lib/download-file.ts index 98f6cc9f04..846fcd5ae5 100644 --- a/packages/hub/src/lib/download-file.ts +++ b/packages/hub/src/lib/download-file.ts @@ -2,6 +2,7 @@ import type { CredentialsParams, RepoDesignation } from "../types/public"; import { checkCredentials } from "../utils/checkCredentials"; import { WebBlob } from "../utils/WebBlob"; import { XetBlob } from "../utils/XetBlob"; +import type { FileDownloadInfoOutput } from "./file-download-info"; import { fileDownloadInfo } from "./file-download-info"; /** @@ -28,23 +29,41 @@ export async function downloadFile( * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers. */ fetch?: typeof fetch; + /** + * Whether to use the xet protocol to download the file (if applicable). + * + * Currently there's experimental support for it, so it's not enabled by default. + * + * It will be enabled automatically in a future minor version. + * + * @default false + */ + xet?: boolean; + /** + * Can save an http request if provided + */ + downloadInfo?: FileDownloadInfoOutput; } & Partial ): Promise { const accessToken = checkCredentials(params); - const info = await fileDownloadInfo({ - repo: params.repo, - path: params.path, - revision: params.revision, - hubUrl: params.hubUrl, - fetch: params.fetch, - raw: params.raw, - }); + + const info = + params.downloadInfo ?? + (await fileDownloadInfo({ + accessToken, + repo: params.repo, + path: params.path, + revision: params.revision, + hubUrl: params.hubUrl, + fetch: params.fetch, + raw: params.raw, + })); if (!info) { return null; } - if (info.xet) { + if (info.xet && params.xet) { return new XetBlob({ hash: info.xet.hash, refreshUrl: info.xet.refreshUrl.href, @@ -54,5 +73,5 @@ export async function downloadFile( }); } - return new WebBlob(new URL(info.url), 0, info.size, "", true, params.fetch ?? fetch); + return new WebBlob(new URL(info.url), 0, info.size, "", true, params.fetch ?? fetch, accessToken); } diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index 98e28ddca8..6ca4abd04f 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -4,15 +4,18 @@ import type { CredentialsParams, RepoDesignation } from "../types/public"; import { checkCredentials } from "../utils/checkCredentials"; import { toRepoId } from "../utils/toRepoId"; -interface XetInfo { +export interface XetFileInfo { hash: string; refreshUrl: URL; + /** + * Later, there will also be a `reconstructionUrl` that can be directly used instead of with the hash. + */ } export interface FileDownloadInfoOutput { size: number; etag: string; - xet?: XetInfo; + xet?: XetFileInfo; // URL to fetch (with the access token if private file) url: string; } @@ -72,7 +75,23 @@ export async function fileDownloadInfo( let etag: string | undefined; let size: number | undefined; - let xetInfo: XetInfo | undefined; + let xetInfo: XetFileInfo | undefined; + + if (size === undefined || isNaN(size)) { + const contentRangeHeader = resp.headers.get("content-range"); + + if (!contentRangeHeader) { + throw new InvalidApiResponseFormatError("Expected size information"); + } + + const [, parsedSize] = contentRangeHeader.split("/"); + size = parseInt(parsedSize); + + if (isNaN(size)) { + throw new InvalidApiResponseFormatError("Invalid file size received"); + } + } + if (resp.headers.get("Content-Type")?.includes("application/vnd.xet-fileinfo+json")) { const json: { casUrl: string; hash: string; refreshUrl: string; size: string; etag: string } = await resp.json(); @@ -91,21 +110,6 @@ export async function fileDownloadInfo( throw new InvalidApiResponseFormatError("Expected ETag"); } - if (size === undefined || isNaN(size)) { - const contentRangeHeader = resp.headers.get("content-range"); - - if (!contentRangeHeader) { - throw new InvalidApiResponseFormatError("Expected size information"); - } - - const [, parsedSize] = contentRangeHeader.split("/"); - size = parseInt(parsedSize); - - if (isNaN(size)) { - throw new InvalidApiResponseFormatError("Invalid file size received"); - } - } - return { etag, size, diff --git a/packages/hub/src/utils/WebBlob.spec.ts b/packages/hub/src/utils/WebBlob.spec.ts index 68ad69e0d3..242a51e08e 100644 --- a/packages/hub/src/utils/WebBlob.spec.ts +++ b/packages/hub/src/utils/WebBlob.spec.ts @@ -15,7 +15,7 @@ describe("WebBlob", () => { }); it("should create a WebBlob with a slice on the entire resource", async () => { - const webBlob = await WebBlob.create(resourceUrl, { cacheBelow: 0 }); + const webBlob = await WebBlob.create(resourceUrl, { cacheBelow: 0, accessToken: undefined }); expect(webBlob).toMatchObject({ url: resourceUrl, @@ -35,7 +35,7 @@ describe("WebBlob", () => { }); it("should create a WebBlob with a slice on the entire resource, cached", async () => { - const webBlob = await WebBlob.create(resourceUrl, { cacheBelow: 1_000_000 }); + const webBlob = await WebBlob.create(resourceUrl, { cacheBelow: 1_000_000, accessToken: undefined }); expect(webBlob).not.toBeInstanceOf(WebBlob); expect(webBlob.size).toBe(size); @@ -75,7 +75,7 @@ describe("WebBlob", () => { it("should create a slice on the file", async () => { const expectedText = fullText.slice(10, 20); - const slice = (await WebBlob.create(resourceUrl, { cacheBelow: 0 })).slice(10, 20); + const slice = (await WebBlob.create(resourceUrl, { cacheBelow: 0, accessToken: undefined })).slice(10, 20); expect(slice).toMatchObject({ url: resourceUrl, diff --git a/packages/hub/src/utils/WebBlob.ts b/packages/hub/src/utils/WebBlob.ts index ff9aa1e0d7..1fa7562f9a 100644 --- a/packages/hub/src/utils/WebBlob.ts +++ b/packages/hub/src/utils/WebBlob.ts @@ -14,12 +14,20 @@ interface WebBlobCreateOptions { * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers. */ fetch?: typeof fetch; + accessToken: string | undefined; } export class WebBlob extends Blob { static async create(url: URL, opts?: WebBlobCreateOptions): Promise { const customFetch = opts?.fetch ?? fetch; - const response = await customFetch(url, { method: "HEAD" }); + const response = await customFetch(url, { + method: "HEAD", + ...(opts?.accessToken && { + headers: { + Authorization: `Bearer ${opts.accessToken}`, + }, + }), + }); const size = Number(response.headers.get("content-length")); const contentType = response.headers.get("content-type") || ""; @@ -29,7 +37,7 @@ export class WebBlob extends Blob { return await (await customFetch(url)).blob(); } - return new WebBlob(url, 0, size, contentType, true, customFetch); + return new WebBlob(url, 0, size, contentType, true, customFetch, opts?.accessToken); } private url: URL; @@ -38,8 +46,17 @@ export class WebBlob extends Blob { private contentType: string; private full: boolean; private fetch: typeof fetch; - - constructor(url: URL, start: number, end: number, contentType: string, full: boolean, customFetch: typeof fetch) { + private accessToken: string | undefined; + + constructor( + url: URL, + start: number, + end: number, + contentType: string, + full: boolean, + customFetch: typeof fetch, + accessToken: string | undefined + ) { super([]); this.url = url; @@ -48,6 +65,7 @@ export class WebBlob extends Blob { this.contentType = contentType; this.full = full; this.fetch = customFetch; + this.accessToken = accessToken; } override get size(): number { @@ -69,7 +87,8 @@ export class WebBlob extends Blob { Math.min(this.start + end, this.end), this.contentType, start === 0 && end === this.size ? this.full : false, - this.fetch + this.fetch, + this.accessToken ); return slice; @@ -105,6 +124,7 @@ export class WebBlob extends Blob { return fetch(this.url, { headers: { Range: `bytes=${this.start}-${this.end - 1}`, + ...(this.accessToken && { Authorization: `Bearer ${this.accessToken}` }), }, }); } From 01789b51e96713528bacf0b8a7b10affec677cad Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 8 Apr 2025 14:28:10 +0200 Subject: [PATCH 13/26] fixup! download web blob with access token --- packages/hub/src/utils/createBlob.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/hub/src/utils/createBlob.ts b/packages/hub/src/utils/createBlob.ts index 0cf54206da..5d5f200a66 100644 --- a/packages/hub/src/utils/createBlob.ts +++ b/packages/hub/src/utils/createBlob.ts @@ -11,9 +11,9 @@ import { isFrontend } from "./isFrontend"; * From the frontend: * - support http resources with absolute or relative URLs */ -export async function createBlob(url: URL, opts?: { fetch?: typeof fetch }): Promise { +export async function createBlob(url: URL, opts?: { fetch?: typeof fetch; accessToken?: string }): Promise { if (url.protocol === "http:" || url.protocol === "https:") { - return WebBlob.create(url, { fetch: opts?.fetch }); + return WebBlob.create(url, { fetch: opts?.fetch, accessToken: opts?.accessToken }); } if (isFrontend) { From 411ed2da33f2f305421a522c8c934f4a7994cae1 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 8 Apr 2025 14:35:06 +0200 Subject: [PATCH 14/26] reorder --- packages/hub/src/lib/file-download-info.ts | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index 6ca4abd04f..2001c132cc 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -77,6 +77,18 @@ export async function fileDownloadInfo( let size: number | undefined; let xetInfo: XetFileInfo | undefined; + if (resp.headers.get("Content-Type")?.includes("application/vnd.xet-fileinfo+json")) { + const json: { casUrl: string; hash: string; refreshUrl: string; size: string; etag: string } = await resp.json(); + + xetInfo = { + hash: json.hash, + refreshUrl: new URL(json.refreshUrl, hubUrl), + }; + + etag = json.etag; + size = parseInt(json.size); + } + if (size === undefined || isNaN(size)) { const contentRangeHeader = resp.headers.get("content-range"); @@ -92,18 +104,6 @@ export async function fileDownloadInfo( } } - if (resp.headers.get("Content-Type")?.includes("application/vnd.xet-fileinfo+json")) { - const json: { casUrl: string; hash: string; refreshUrl: string; size: string; etag: string } = await resp.json(); - - xetInfo = { - hash: json.hash, - refreshUrl: new URL(json.refreshUrl, hubUrl), - }; - - etag = json.etag; - size = parseInt(json.size); - } - etag ??= resp.headers.get("ETag") ?? undefined; if (!etag) { From 6ab4ad1760aa7306096b7a7ec75158856f17e0c6 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 8 Apr 2025 15:35:43 +0200 Subject: [PATCH 15/26] test to download private file --- packages/hub/src/lib/download-file.spec.ts | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/packages/hub/src/lib/download-file.spec.ts b/packages/hub/src/lib/download-file.spec.ts index 6f92b23d4c..6790a74ee8 100644 --- a/packages/hub/src/lib/download-file.spec.ts +++ b/packages/hub/src/lib/download-file.spec.ts @@ -1,5 +1,9 @@ import { expect, test, describe, assert } from "vitest"; import { downloadFile } from "./download-file"; +import { deleteRepo } from "./delete-repo"; +import { createRepo } from "./create-repo"; +import { TEST_ACCESS_TOKEN, TEST_HUB_URL, TEST_USER } from "../test/consts"; +import { insecureRandomString } from "../utils/insecureRandomString"; describe("downloadFile", () => { test("should download regular file", async () => { @@ -39,4 +43,40 @@ Test the whole generation capabilities here: https://transformer.huggingface.co/ const text = await blob?.slice(0, 100).text(); expect(text).toMatch("this is a text file.".repeat(10).slice(0, 100)); }); + + test("should download private file", async () => { + const repoName = `datasets/${TEST_USER}/TEST-${insecureRandomString()}`; + + const result = await createRepo({ + accessToken: TEST_ACCESS_TOKEN, + hubUrl: TEST_HUB_URL, + private: true, + repo: repoName, + files: [{ path: ".gitattributes", content: new Blob(["*.html filter=lfs diff=lfs merge=lfs -text"]) }], + }); + + assert.deepStrictEqual(result, { + repoUrl: `${TEST_HUB_URL}/${repoName}`, + }); + + try { + const blob = await downloadFile({ + repo: { type: "dataset", name: repoName }, + path: ".gitattributes", + hubUrl: TEST_HUB_URL, + accessToken: TEST_ACCESS_TOKEN, + }); + + assert(blob, "File should be found"); + + const text = await blob?.text(); + assert.strictEqual(text, "*.html filter=lfs diff=lfs merge=lfs -text"); + } finally { + await deleteRepo({ + repo: repoName, + hubUrl: TEST_HUB_URL, + credentials: { accessToken: TEST_ACCESS_TOKEN }, + }); + } + }); }); From ba9990fc1990bd191e3b48e19b9a8e7a0a53be8b Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 8 Apr 2025 16:14:51 +0200 Subject: [PATCH 16/26] throw errors in WebBlob --- packages/hub/src/lib/download-file.spec.ts | 4 ++-- packages/hub/src/utils/WebBlob.ts | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/hub/src/lib/download-file.spec.ts b/packages/hub/src/lib/download-file.spec.ts index 6790a74ee8..c25a8b69b8 100644 --- a/packages/hub/src/lib/download-file.spec.ts +++ b/packages/hub/src/lib/download-file.spec.ts @@ -61,7 +61,7 @@ Test the whole generation capabilities here: https://transformer.huggingface.co/ try { const blob = await downloadFile({ - repo: { type: "dataset", name: repoName }, + repo: repoName, path: ".gitattributes", hubUrl: TEST_HUB_URL, accessToken: TEST_ACCESS_TOKEN, @@ -75,7 +75,7 @@ Test the whole generation capabilities here: https://transformer.huggingface.co/ await deleteRepo({ repo: repoName, hubUrl: TEST_HUB_URL, - credentials: { accessToken: TEST_ACCESS_TOKEN }, + accessToken: TEST_ACCESS_TOKEN, }); } }); diff --git a/packages/hub/src/utils/WebBlob.ts b/packages/hub/src/utils/WebBlob.ts index 1fa7562f9a..99cc8e0db0 100644 --- a/packages/hub/src/utils/WebBlob.ts +++ b/packages/hub/src/utils/WebBlob.ts @@ -2,6 +2,8 @@ * WebBlob is a Blob implementation for web resources that supports range requests. */ +import { createApiError } from "../error"; + interface WebBlobCreateOptions { /** * @default 1_000_000 @@ -119,13 +121,13 @@ export class WebBlob extends Blob { private fetchRange(): Promise { const fetch = this.fetch; // to avoid this.fetch() which is bound to the instance instead of globalThis if (this.full) { - return fetch(this.url); + return fetch(this.url).then((resp) => (resp.ok ? resp : createApiError(resp))); } return fetch(this.url, { headers: { Range: `bytes=${this.start}-${this.end - 1}`, ...(this.accessToken && { Authorization: `Bearer ${this.accessToken}` }), }, - }); + }).then((resp) => (resp.ok ? resp : createApiError(resp))); } } From 17d17a96e57f50a7cc6c1379ab93f594e27b0baa Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 8 Apr 2025 16:18:55 +0200 Subject: [PATCH 17/26] fixup! throw errors in WebBlob --- packages/hub/src/utils/WebBlob.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/packages/hub/src/utils/WebBlob.ts b/packages/hub/src/utils/WebBlob.ts index 99cc8e0db0..364bd95094 100644 --- a/packages/hub/src/utils/WebBlob.ts +++ b/packages/hub/src/utils/WebBlob.ts @@ -121,7 +121,13 @@ export class WebBlob extends Blob { private fetchRange(): Promise { const fetch = this.fetch; // to avoid this.fetch() which is bound to the instance instead of globalThis if (this.full) { - return fetch(this.url).then((resp) => (resp.ok ? resp : createApiError(resp))); + return fetch(this.url, { + ...(this.accessToken && { + headers: { + Authorization: `Bearer ${this.accessToken}`, + }, + }), + }).then((resp) => (resp.ok ? resp : createApiError(resp))); } return fetch(this.url, { headers: { From bbd15d156de45e1f12cabbae4688a0fcc6470ec8 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 29 Apr 2025 14:46:56 +0200 Subject: [PATCH 18/26] remove 10s timeout --- packages/hub/src/lib/upload-files-with-progress.spec.ts | 2 +- packages/hub/src/lib/upload-files.spec.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/hub/src/lib/upload-files-with-progress.spec.ts b/packages/hub/src/lib/upload-files-with-progress.spec.ts index b088e55973..50a1b4d380 100644 --- a/packages/hub/src/lib/upload-files-with-progress.spec.ts +++ b/packages/hub/src/lib/upload-files-with-progress.spec.ts @@ -164,5 +164,5 @@ describe("uploadFilesWithProgress", () => { hubUrl: TEST_HUB_URL, }); } - }, 60_000); + }); }); diff --git a/packages/hub/src/lib/upload-files.spec.ts b/packages/hub/src/lib/upload-files.spec.ts index 89206c99c8..94258ad1b2 100644 --- a/packages/hub/src/lib/upload-files.spec.ts +++ b/packages/hub/src/lib/upload-files.spec.ts @@ -92,4 +92,4 @@ describe("uploadFiles", () => { }); } }); -}, 10_000); +}); From 12ca93d1455b39118d0153a8cbc62eb5fe9d5410 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 29 Apr 2025 14:54:18 +0200 Subject: [PATCH 19/26] remove more timeouts --- packages/hub/src/lib/create-repo.spec.ts | 2 +- packages/hub/src/lib/delete-files.spec.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/hub/src/lib/create-repo.spec.ts b/packages/hub/src/lib/create-repo.spec.ts index c1a39b9f81..92d4d6b51a 100644 --- a/packages/hub/src/lib/create-repo.spec.ts +++ b/packages/hub/src/lib/create-repo.spec.ts @@ -100,4 +100,4 @@ describe("createRepo", () => { credentials: { accessToken: TEST_ACCESS_TOKEN }, }); }); -}, 10_000); +}); diff --git a/packages/hub/src/lib/delete-files.spec.ts b/packages/hub/src/lib/delete-files.spec.ts index 558da6a6ba..8124d9afa0 100644 --- a/packages/hub/src/lib/delete-files.spec.ts +++ b/packages/hub/src/lib/delete-files.spec.ts @@ -78,4 +78,4 @@ describe("deleteFiles", () => { }); } }); -}, 10_000); +}); From 60af37260361796ef50859a70c9926401f3f1b68 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 29 Apr 2025 14:57:24 +0200 Subject: [PATCH 20/26] Fix E2Es maybe cc @SBrandeis @hanouticelina --- e2e/deno/index.ts | 1 - e2e/svelte/src/routes/+page.svelte | 1 - e2e/ts/src/index.ts | 1 - 3 files changed, 3 deletions(-) diff --git a/e2e/deno/index.ts b/e2e/deno/index.ts index 02d11fe249..e300b3c1b6 100644 --- a/e2e/deno/index.ts +++ b/e2e/deno/index.ts @@ -16,7 +16,6 @@ if (token) { console.log(tokenInfo); const sum = await hf.summarization({ - model: "facebook/bart-large-cnn", inputs: "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930.", parameters: { diff --git a/e2e/svelte/src/routes/+page.svelte b/e2e/svelte/src/routes/+page.svelte index 5c36447d5e..c798e055ed 100644 --- a/e2e/svelte/src/routes/+page.svelte +++ b/e2e/svelte/src/routes/+page.svelte @@ -13,7 +13,6 @@ } const sum = await hf.summarization({ - model: "facebook/bart-large-cnn", inputs: "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930.", parameters: { diff --git a/e2e/ts/src/index.ts b/e2e/ts/src/index.ts index e08467be8a..93e2643f5b 100644 --- a/e2e/ts/src/index.ts +++ b/e2e/ts/src/index.ts @@ -11,7 +11,6 @@ const hf = new InferenceClient(hfToken); if (hfToken) { const sum = await hf.summarization({ - model: "facebook/bart-large-cnn", inputs: "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930.", parameters: { From b4edda38d0cc37d255042bff0563d11c1597f898 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 29 Apr 2025 15:56:33 +0200 Subject: [PATCH 21/26] increase timeout for browser --- e2e/ts/.gitignore | 3 ++- packages/hub/vitest-browser.config.mts | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/e2e/ts/.gitignore b/e2e/ts/.gitignore index 483a9c42c3..1b79bf832b 100644 --- a/e2e/ts/.gitignore +++ b/e2e/ts/.gitignore @@ -1 +1,2 @@ -package-lock.json \ No newline at end of file +package-lock.json +pnpm-lock.yaml \ No newline at end of file diff --git a/packages/hub/vitest-browser.config.mts b/packages/hub/vitest-browser.config.mts index e2e1e87f98..e0efe5657c 100644 --- a/packages/hub/vitest-browser.config.mts +++ b/packages/hub/vitest-browser.config.mts @@ -2,7 +2,7 @@ import { configDefaults, defineConfig } from "vitest/config"; export default defineConfig({ test: { - testTimeout: 30_000, + testTimeout: 60_000, exclude: [ ...configDefaults.exclude, "src/utils/FileBlob.spec.ts", From cd1ae6f19fc9fa215da31a69db467f757777553f Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Tue, 29 Apr 2025 16:00:41 +0200 Subject: [PATCH 22/26] debug json format for file info in browser mode --- packages/hub/src/lib/file-download-info.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index 2001c132cc..6a20938c2c 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -78,7 +78,16 @@ export async function fileDownloadInfo( let xetInfo: XetFileInfo | undefined; if (resp.headers.get("Content-Type")?.includes("application/vnd.xet-fileinfo+json")) { - const json: { casUrl: string; hash: string; refreshUrl: string; size: string; etag: string } = await resp.json(); + const text = await resp.text(); + const json: { casUrl: string; hash: string; refreshUrl: string; size: string; etag: string } = (() => { + try { + return JSON.parse(text); + } catch (e) { + throw new InvalidApiResponseFormatError( + "Invalid JSON response: " + text + ", content-type: " + resp.headers.get("Content-Type") + ); + } + })(); xetInfo = { hash: json.hash, From 3786afcb49f26745a2126adfb5327fa73ee1f177 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Wed, 30 Apr 2025 14:58:51 +0200 Subject: [PATCH 23/26] use headers now --- packages/hub/src/lib/download-file.ts | 2 +- packages/hub/src/lib/file-download-info.ts | 44 +++++++++++++--------- packages/hub/src/utils/XetBlob.ts | 12 ++++-- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/packages/hub/src/lib/download-file.ts b/packages/hub/src/lib/download-file.ts index 846fcd5ae5..5174bc09da 100644 --- a/packages/hub/src/lib/download-file.ts +++ b/packages/hub/src/lib/download-file.ts @@ -65,8 +65,8 @@ export async function downloadFile( if (info.xet && params.xet) { return new XetBlob({ - hash: info.xet.hash, refreshUrl: info.xet.refreshUrl.href, + reconstructionUrl: info.xet.reconstructionUrl.href, fetch: params.fetch, accessToken, size: info.size, diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index 6a20938c2c..539e5d04c2 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -2,14 +2,16 @@ import { HUB_URL } from "../consts"; import { createApiError, InvalidApiResponseFormatError } from "../error"; import type { CredentialsParams, RepoDesignation } from "../types/public"; import { checkCredentials } from "../utils/checkCredentials"; +import { parseLinkHeader } from "../utils/parseLinkHeader"; import { toRepoId } from "../utils/toRepoId"; export interface XetFileInfo { hash: string; refreshUrl: URL; /** - * Later, there will also be a `reconstructionUrl` that can be directly used instead of with the hash. + * Can be directly used instead of the hash. */ + reconstructionUrl: URL; } export interface FileDownloadInfoOutput { @@ -73,29 +75,35 @@ export async function fileDownloadInfo( throw await createApiError(resp); } - let etag: string | undefined; let size: number | undefined; let xetInfo: XetFileInfo | undefined; if (resp.headers.get("Content-Type")?.includes("application/vnd.xet-fileinfo+json")) { - const text = await resp.text(); - const json: { casUrl: string; hash: string; refreshUrl: string; size: string; etag: string } = (() => { - try { - return JSON.parse(text); - } catch (e) { - throw new InvalidApiResponseFormatError( - "Invalid JSON response: " + text + ", content-type: " + resp.headers.get("Content-Type") - ); - } - })(); + size = parseInt(resp.headers.get("X-Linked-Size") ?? "invalid"); + if (isNaN(size)) { + throw new InvalidApiResponseFormatError("Invalid file size received in X-Linked-Size header"); + } + + const hash = resp.headers.get("X-Xet-Hash"); + const links = parseLinkHeader(resp.headers.get("Link") ?? ""); + + const reconstructionUrl = URL.parse + ? URL.parse(links["xet-reconstruction-info"]) + : new URL(links["xet-reconstruction-info"]); + const refreshUrl = URL.parse ? URL.parse(links["xet-auth"]) : new URL(links["xet-auth"]); + if (!hash) { + throw new InvalidApiResponseFormatError("No hash received in X-Xet-Hash header"); + } + + if (!reconstructionUrl || !refreshUrl) { + throw new InvalidApiResponseFormatError("No xet-reconstruction-info or xet-auth link header"); + } xetInfo = { - hash: json.hash, - refreshUrl: new URL(json.refreshUrl, hubUrl), + hash, + refreshUrl, + reconstructionUrl, }; - - etag = json.etag; - size = parseInt(json.size); } if (size === undefined || isNaN(size)) { @@ -113,7 +121,7 @@ export async function fileDownloadInfo( } } - etag ??= resp.headers.get("ETag") ?? undefined; + const etag = resp.headers.get("ETag") ?? undefined; if (!etag) { throw new InvalidApiResponseFormatError("Expected ETag"); diff --git a/packages/hub/src/utils/XetBlob.ts b/packages/hub/src/utils/XetBlob.ts index aed5852e73..3b787b0e54 100644 --- a/packages/hub/src/utils/XetBlob.ts +++ b/packages/hub/src/utils/XetBlob.ts @@ -12,13 +12,13 @@ type XetBlobCreateOptions = { * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers. */ fetch?: typeof fetch; - hash: string; // URL to get the access token from refreshUrl: string; size: number; listener?: (arg: { event: "read" } | { event: "progress"; progress: { read: number; total: number } }) => void; internalLogging?: boolean; -} & Partial; +} & ({ hash: string; reconstructionUrl?: string } | { hash?: string; reconstructionUrl: string }) & + Partial; export interface ReconstructionInfo { /** @@ -84,7 +84,8 @@ export class XetBlob extends Blob { fetch: typeof fetch; accessToken?: string; refreshUrl: string; - hash: string; + reconstructionUrl?: string; + hash?: string; start = 0; end = 0; internalLogging = false; @@ -98,6 +99,7 @@ export class XetBlob extends Blob { this.accessToken = checkCredentials(params); this.refreshUrl = params.refreshUrl; this.end = params.size; + this.reconstructionUrl = params.reconstructionUrl; this.hash = params.hash; this.listener = params.listener; this.internalLogging = params.internalLogging ?? false; @@ -113,6 +115,8 @@ export class XetBlob extends Blob { fetch: this.fetch, hash: this.hash, refreshUrl: this.refreshUrl, + // eslint-disable-next-line @typescript-eslint/no-non-null-assertion + reconstructionUrl: this.reconstructionUrl!, size: this.size, }); @@ -157,7 +161,7 @@ export class XetBlob extends Blob { // `curl '${connParams.casUrl}/reconstruction/${this.hash}' -H 'Authorization: Bearer ${connParams.accessToken}'` // ); - const resp = await this.fetch(`${connParams.casUrl}/reconstruction/${this.hash}`, { + const resp = await this.fetch(this.reconstructionUrl ?? `${connParams.casUrl}/reconstruction/${this.hash}`, { headers: { Authorization: `Bearer ${connParams.accessToken}`, Range: `bytes=${this.start}-${this.end - 1}`, From 50306166c6319587464c97fffff585cb9a7463a3 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Wed, 30 Apr 2025 15:07:56 +0200 Subject: [PATCH 24/26] lint --- packages/hub/src/lib/file-download-info.ts | 18 ++++++++++++++---- packages/hub/src/utils/createBlobs.ts | 4 ++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index 539e5d04c2..50af612ac6 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -87,10 +87,20 @@ export async function fileDownloadInfo( const hash = resp.headers.get("X-Xet-Hash"); const links = parseLinkHeader(resp.headers.get("Link") ?? ""); - const reconstructionUrl = URL.parse - ? URL.parse(links["xet-reconstruction-info"]) - : new URL(links["xet-reconstruction-info"]); - const refreshUrl = URL.parse ? URL.parse(links["xet-auth"]) : new URL(links["xet-auth"]); + const reconstructionUrl = (() => { + try { + return new URL(links["xet-reconstruction-info"]); + } catch { + return null; + } + })(); + const refreshUrl = (() => { + try { + return new URL(links["xet-auth"]); + } catch { + return null; + } + })(); if (!hash) { throw new InvalidApiResponseFormatError("No hash received in X-Xet-Hash header"); diff --git a/packages/hub/src/utils/createBlobs.ts b/packages/hub/src/utils/createBlobs.ts index ebe63ed5d1..1a261c4c48 100644 --- a/packages/hub/src/utils/createBlobs.ts +++ b/packages/hub/src/utils/createBlobs.ts @@ -15,10 +15,10 @@ import { isFrontend } from "./isFrontend"; export async function createBlobs( url: URL, destPath: string, - opts?: { fetch?: typeof fetch; maxFolderDepth?: number } + opts?: { fetch?: typeof fetch; maxFolderDepth?: number; accessToken?: string } ): Promise> { if (url.protocol === "http:" || url.protocol === "https:") { - const blob = await WebBlob.create(url, { fetch: opts?.fetch }); + const blob = await WebBlob.create(url, { fetch: opts?.fetch, accessToken: opts?.accessToken }); return [{ path: destPath, blob }]; } From bbc3a7637d5334c0b8a1f4c2b60947c009fbeb63 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Wed, 30 Apr 2025 15:08:50 +0200 Subject: [PATCH 25/26] remove extra dev dep --- packages/hub/package.json | 3 --- packages/hub/pnpm-lock.yaml | 17 ----------------- 2 files changed, 20 deletions(-) diff --git a/packages/hub/package.json b/packages/hub/package.json index 15d2e31c8d..c880d2acf4 100644 --- a/packages/hub/package.json +++ b/packages/hub/package.json @@ -59,9 +59,6 @@ ], "author": "Hugging Face", "license": "MIT", - "devDependencies": { - "@types/node": "^20.11.28" - }, "dependencies": { "@huggingface/tasks": "workspace:^" } diff --git a/packages/hub/pnpm-lock.yaml b/packages/hub/pnpm-lock.yaml index 7310740bc0..a7ed8dd673 100644 --- a/packages/hub/pnpm-lock.yaml +++ b/packages/hub/pnpm-lock.yaml @@ -8,20 +8,3 @@ dependencies: '@huggingface/tasks': specifier: workspace:^ version: link:../tasks - -devDependencies: - '@types/node': - specifier: ^20.11.28 - version: 20.11.28 - -packages: - - /@types/node@20.11.28: - resolution: {integrity: sha512-M/GPWVS2wLkSkNHVeLkrF2fD5Lx5UC4PxA0uZcKc6QqbIQUJyW1jVjueJYi1z8n0I5PxYrtpnPnWglE+y9A0KA==} - dependencies: - undici-types: 5.26.5 - dev: true - - /undici-types@5.26.5: - resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} - dev: true From 524e98788c98b89f7ffd2aeae59b4268c7474491 Mon Sep 17 00:00:00 2001 From: coyotte508 Date: Mon, 5 May 2025 16:57:46 +0200 Subject: [PATCH 26/26] Use X-Linked-Etag header in priority --- packages/hub/src/lib/file-download-info.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/hub/src/lib/file-download-info.ts b/packages/hub/src/lib/file-download-info.ts index 50af612ac6..4350dc2c9d 100644 --- a/packages/hub/src/lib/file-download-info.ts +++ b/packages/hub/src/lib/file-download-info.ts @@ -131,7 +131,7 @@ export async function fileDownloadInfo( } } - const etag = resp.headers.get("ETag") ?? undefined; + const etag = resp.headers.get("X-Linked-ETag") ?? resp.headers.get("ETag") ?? undefined; if (!etag) { throw new InvalidApiResponseFormatError("Expected ETag");