fix: inline http(s) image URLs for Moonshot vision models (#2929) (#2931)

Co-authored-by: devareddy05 <pdevendrareddy@guidewire.com>
This commit is contained in:
Devendra Reddy Pennabadi
2026-05-06 23:23:52 +05:30
committed by GitHub
parent ff8c275317
commit 7a1a885e89
3 changed files with 328 additions and 0 deletions
@@ -27,6 +27,7 @@ import type {
ICompleteArguments,
IChatCompleteResult,
} from '../../types.js';
import { inlineHttpImageUrls } from './imageHandling.js';
import { MOONSHOT_MODELS } from './models.js';
export class MoonshotProvider implements IChatProvider {
@@ -74,6 +75,13 @@ export class MoonshotProvider implements IChatProvider {
availableModels.find((m) =>
[m.id, ...(m.aliases || [])].includes(model),
) || availableModels.find((m) => m.id === this.getDefaultModel())!;
// Moonshot's vision API doesn't fetch http(s) URLs; inline them
// so callers can pass plain links like other vision providers.
if (modelUsed.modalities?.input?.includes('image')) {
await inlineHttpImageUrls(messages);
}
messages = await OpenAIUtil.process_input_messages(messages);
let completion;
try {
@@ -0,0 +1,208 @@
/*
* Copyright (C) 2024-present Puter Technologies Inc.
*
* This file is part of Puter.
*
* Puter is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import { afterEach, describe, expect, it, vi } from 'vitest';
// Swap the real secureFetch for a vitest mock function so the tests below
// control every "network" response and never perform real HTTP requests.
vi.mock('../../../../util/secureHttp.js', () => ({
secureFetch: vi.fn(),
}));
import { secureFetch } from '../../../../util/secureHttp.js';
import { inlineHttpImageUrls, MAX_IMAGE_BYTES } from './imageHandling.js';
// Typed handle on the mocked secureFetch; each test queues a resolved or
// rejected value on it and inspects how it was called.
const mockedSecureFetch = vi.mocked(secureFetch);
/**
 * Builds a minimal stand-in for a fetch `Response`, exposing only the members
 * the code under test reads: `ok`, `status`, `headers` and `arrayBuffer()`.
 *
 * @param body - Payload bytes handed back by `arrayBuffer()`.
 * @param options - Optional overrides. `status` defaults to 200 and
 *   `contentType` to `'image/png'`; a `null`/empty `contentType` or a missing
 *   `contentLength` leaves the corresponding header unset.
 * @returns An object cast to `Response`; it is not a real `Response` instance.
 */
const buildResponse = (
    body: Buffer | ArrayBuffer,
    options: {
        status?: number;
        contentType?: string | null;
        contentLength?: string;
    } = {},
): Response => {
    const { status = 200, contentType = 'image/png', contentLength } = options;
    const bytes = Buffer.isBuffer(body) ? body : Buffer.from(body);

    // Only emit the headers that were actually requested.
    const headerPairs: Array<[string, string]> = [];
    if (contentType) headerPairs.push(['content-type', contentType]);
    if (contentLength) headerPairs.push(['content-length', contentLength]);

    // A Buffer may be a view into a larger pooled ArrayBuffer, so copy out
    // exactly the window it covers.
    const start = bytes.byteOffset;
    const end = start + bytes.byteLength;

    const fake = {
        ok: status >= 200 && status < 300,
        status,
        headers: new Headers(headerPairs),
        arrayBuffer: async () => bytes.buffer.slice(start, end),
    };
    return fake as unknown as Response;
};
// Behavioural tests for inlineHttpImageUrls. secureFetch is mocked, so each
// case queues exactly the response (or rejection) it wants to observe.
describe('inlineHttpImageUrls', () => {
    afterEach(() => {
        // Drop queued responses and recorded calls so cases stay independent.
        mockedSecureFetch.mockReset();
    });

    it('rewrites http(s) image URLs to base64 data URIs', async () => {
        const png = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]);
        mockedSecureFetch.mockResolvedValueOnce(
            buildResponse(png, { contentType: 'image/png' }),
        );
        const messages = [
            {
                role: 'user',
                content: [
                    { type: 'text', text: 'what is this' },
                    { image_url: { url: 'https://example.com/cat.png' } },
                ],
            },
        ];
        await inlineHttpImageUrls(messages);
        expect(mockedSecureFetch).toHaveBeenCalledWith(
            'https://example.com/cat.png',
        );
        const part = messages[0].content[1] as {
            type?: string;
            image_url?: { url?: string };
        };
        expect(part.type).toBe('image_url');
        expect(part.image_url?.url).toBe(
            `data:image/png;base64,${png.toString('base64')}`,
        );
    });

    it('leaves data URIs untouched and skips fetching', async () => {
        const messages = [
            {
                role: 'user',
                content: [
                    {
                        type: 'image_url',
                        image_url: {
                            url: 'data:image/png;base64,AAAA',
                        },
                    },
                ],
            },
        ];
        await inlineHttpImageUrls(messages);
        expect(mockedSecureFetch).not.toHaveBeenCalled();
        const part = messages[0].content[0] as { image_url?: { url?: string } };
        expect(part.image_url?.url).toBe('data:image/png;base64,AAAA');
    });

    it('replaces oversized images with a text error block', async () => {
        // The declared Content-Length alone must trigger the rejection; the
        // actual body is tiny.
        const oversize = Buffer.alloc(10);
        mockedSecureFetch.mockResolvedValueOnce(
            buildResponse(oversize, {
                contentType: 'image/jpeg',
                contentLength: String(MAX_IMAGE_BYTES + 1),
            }),
        );
        const messages = [
            {
                role: 'user',
                content: [
                    { image_url: { url: 'https://example.com/huge.jpg' } },
                ],
            },
        ];
        await inlineHttpImageUrls(messages);
        const part = messages[0].content[0] as {
            type?: string;
            text?: string;
            image_url?: unknown;
        };
        expect(part.type).toBe('text');
        expect(part.image_url).toBeUndefined();
        expect(part.text).toContain('exceeds maximum');
    });

    it('replaces oversized bodies that omit Content-Length', async () => {
        // No Content-Length header: the cap has to be enforced on the bytes
        // actually received.
        mockedSecureFetch.mockResolvedValueOnce(
            buildResponse(Buffer.alloc(MAX_IMAGE_BYTES + 1), {
                contentType: 'image/png',
            }),
        );
        const messages = [
            {
                role: 'user',
                content: [
                    { image_url: { url: 'https://example.com/chunked.png' } },
                ],
            },
        ];
        await inlineHttpImageUrls(messages);
        const part = messages[0].content[0] as {
            type?: string;
            text?: string;
            image_url?: unknown;
        };
        expect(part.type).toBe('text');
        expect(part.image_url).toBeUndefined();
        expect(part.text).toContain('exceeds maximum');
    });

    it('replaces non-2xx responses with a text error block', async () => {
        mockedSecureFetch.mockResolvedValueOnce(
            buildResponse(Buffer.alloc(0), { status: 404 }),
        );
        const messages = [
            {
                role: 'user',
                content: [
                    { image_url: { url: 'https://example.com/missing.png' } },
                ],
            },
        ];
        await inlineHttpImageUrls(messages);
        const part = messages[0].content[0] as {
            type?: string;
            text?: string;
            image_url?: unknown;
        };
        expect(part.type).toBe('text');
        expect(part.image_url).toBeUndefined();
        expect(part.text).toContain('status 404');
    });

    it('replaces non-image responses with a text error block', async () => {
        mockedSecureFetch.mockResolvedValueOnce(
            buildResponse(Buffer.from('<html/>'), {
                contentType: 'text/html',
            }),
        );
        const messages = [
            {
                role: 'user',
                content: [
                    { image_url: { url: 'https://example.com/page' } },
                ],
            },
        ];
        await inlineHttpImageUrls(messages);
        const part = messages[0].content[0] as {
            type?: string;
            text?: string;
        };
        expect(part.type).toBe('text');
        expect(part.text).toContain('expected an image');
    });

    it('replaces fetch failures with a text error block', async () => {
        mockedSecureFetch.mockRejectedValueOnce(new Error('boom'));
        const messages = [
            {
                role: 'user',
                content: [
                    { image_url: { url: 'https://example.com/x.png' } },
                ],
            },
        ];
        await inlineHttpImageUrls(messages);
        const part = messages[0].content[0] as {
            type?: string;
            text?: string;
        };
        expect(part.type).toBe('text');
        expect(part.text).toContain('boom');
    });

    it('ignores non-image-url parts and string content', async () => {
        const messages = [
            { role: 'user', content: 'plain text' },
            {
                role: 'user',
                content: [
                    { type: 'text', text: 'still here' },
                    { type: 'tool_use', id: 't', name: 'x', input: {} },
                ],
            },
        ];
        // Snapshot first: the function mutates in place, so "ignored" must
        // mean byte-for-byte unchanged, not merely "no fetch happened".
        const before = JSON.parse(JSON.stringify(messages));
        await inlineHttpImageUrls(messages);
        expect(mockedSecureFetch).not.toHaveBeenCalled();
        expect(messages).toEqual(before);
    });
});
@@ -0,0 +1,112 @@
/*
* Copyright (C) 2024-present Puter Technologies Inc.
*
* This file is part of Puter.
*
* Puter is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import { secureFetch } from '../../../../util/secureHttp.js';
// Decimal megabyte (10^6 bytes), the unit the cap below is expressed in.
const BYTES_PER_MEGABYTE = 1_000_000;

/**
 * Largest image, in bytes, that will be fetched and inlined as a data URI.
 * Matches the OpenAI Chat-Completions inline-upload cap.
 */
export const MAX_IMAGE_BYTES = 5 * BYTES_PER_MEGABYTE;
// One entry of a message's `content` array, limited to the fields this module
// reads or rewrites. Every field is optional because a part is mutated in
// place from an image part into either a data-URI image part or a text part.
interface ImageContentPart {
// Part discriminator; this module sets it to 'image_url' or 'text'.
type?: string;
// Populated only when the part has been turned into a text error note.
text?: string;
// Image reference; `url` is an http(s) link on input and a base64 data URI
// once inlined. Removed entirely when the part becomes a text error.
image_url?: { url?: string };
}
// Minimal view of a chat message: only `content` is inspected here. It is
// typed `unknown` because it may be a plain string or an array of parts, and
// only the array form is processed.
interface MessageWithContent {
content?: unknown;
}
// URI schemes are case-insensitive, so `HTTPS://…` must be recognised as well.
const HTTP_URL_PATTERN = /^https?:\/\//i;

/**
 * Rewrites, in place, every http(s) `image_url` part found in `messages` into
 * a base64 data URI.
 *
 * Moonshot's vision API rejects http(s) image URLs and only accepts base64
 * data URIs or file-id refs, so any web URL must be fetched and inlined.
 * Failures become inline text-error parts (same shape as
 * openai/fileUpload.ts).
 *
 * Messages whose `content` is not an array, parts without a string `url`, and
 * URLs that are not http(s) (for example existing data URIs) are left
 * untouched.
 *
 * @param messages - Chat messages to scan; matching parts are mutated.
 * @returns A promise that settles once every matching part has been handled.
 *   All downloads are started before any of them is awaited.
 */
export async function inlineHttpImageUrls(
    messages: MessageWithContent[],
): Promise<void> {
    const tasks: Array<Promise<void>> = [];
    for (const message of messages) {
        // Tolerate null/undefined entries instead of throwing on `.content`.
        const content = message?.content;
        if (!Array.isArray(content)) continue;
        for (const part of content as ImageContentPart[]) {
            // Content is caller-supplied, so despite the declared type `url`
            // may be any value; calling string methods on a non-string would
            // throw and abort the whole request.
            const url: unknown = part?.image_url?.url;
            if (typeof url !== 'string') continue;
            if (!HTTP_URL_PATTERN.test(url)) continue;
            tasks.push(inlineOne(part, url));
        }
    }
    await Promise.all(tasks);
}
/**
 * Reads a response body into memory, giving up as soon as more than `limit`
 * bytes have been received.
 *
 * When the body can be consumed incrementally (it is async-iterable) the read
 * stops at the first chunk that crosses the limit, so a server that omits or
 * understates Content-Length cannot force an unbounded allocation. Leaving
 * the `for await` loop early also closes the underlying iterator. Otherwise
 * the whole body is read with `arrayBuffer()` and measured afterwards.
 *
 * @param response - Object exposing an optional `body` and `arrayBuffer()`.
 * @param limit - Maximum accepted body size in bytes.
 * @returns The body bytes, or `null` when the body is larger than `limit`.
 */
async function readBodyWithinLimit(
    response: { body?: unknown; arrayBuffer(): Promise<ArrayBuffer> },
    limit: number,
): Promise<Buffer | null> {
    const stream = response.body;
    if (
        typeof stream === 'object' &&
        stream !== null &&
        Symbol.asyncIterator in stream
    ) {
        const chunks: Uint8Array[] = [];
        let received = 0;
        // NOTE(review): assumes the stream yields Uint8Array/Buffer chunks,
        // as fetch response bodies do — confirm for secureFetch.
        for await (const chunk of stream as AsyncIterable<Uint8Array>) {
            received += chunk.byteLength;
            if (received > limit) return null;
            chunks.push(chunk);
        }
        return Buffer.concat(chunks, received);
    }
    const whole = await response.arrayBuffer();
    return whole.byteLength > limit ? null : Buffer.from(whole);
}

/**
 * Fetches one image URL and rewrites `part` in place.
 *
 * On success `part` becomes an `image_url` part holding a base64 data URI.
 * On any failure (non-2xx status, body over MAX_IMAGE_BYTES, non-image
 * Content-Type, or a thrown fetch error) `part` becomes a text error part via
 * setTextError. This function never rejects.
 *
 * Both header checks run before the body is read, so responses that are
 * oversized or not images are rejected without being downloaded.
 *
 * @param part - The content part to rewrite.
 * @param url - The http(s) URL taken from `part.image_url.url`.
 */
async function inlineOne(part: ImageContentPart, url: string): Promise<void> {
    try {
        const response = await secureFetch(url);
        if (!response.ok) {
            setTextError(
                part,
                `failed to fetch image (status ${response.status})`,
            );
            return;
        }

        // Cheap early rejection; the header is advisory, so the real size is
        // enforced again while reading the body.
        const declaredLength = Number(
            response.headers.get('content-length') ?? NaN,
        );
        if (
            Number.isFinite(declaredLength) &&
            declaredLength > MAX_IMAGE_BYTES
        ) {
            setTextError(
                part,
                `image exceeds maximum of ${MAX_IMAGE_BYTES} bytes`,
            );
            return;
        }

        // Media types are case-insensitive; normalise before comparing and
        // drop any parameters such as `; charset=...`.
        const mimeType = (response.headers.get('content-type') ?? '')
            .split(';')[0]
            ?.trim()
            .toLowerCase();
        if (!mimeType || !mimeType.startsWith('image/')) {
            setTextError(
                part,
                `expected an image, got ${mimeType || 'unknown MIME type'}`,
            );
            return;
        }

        const bytes = await readBodyWithinLimit(response, MAX_IMAGE_BYTES);
        if (bytes === null) {
            setTextError(
                part,
                `image exceeds maximum of ${MAX_IMAGE_BYTES} bytes`,
            );
            return;
        }

        part.type = 'image_url';
        part.image_url = {
            url: `data:${mimeType};base64,${bytes.toString('base64')}`,
        };
    } catch (err) {
        // Best effort: one unreachable image must not fail the whole request.
        const message = (err as Error)?.message || 'failed to fetch image';
        setTextError(part, message);
    }
}
/**
 * Converts `part`, in place, into a text part carrying an error note.
 *
 * The image reference is removed so the part no longer looks like an image,
 * then `type` and `text` are overwritten.
 *
 * @param part - The content part to rewrite.
 * @param reason - Human-readable failure reason embedded in the note.
 */
function setTextError(part: ImageContentPart, reason: string): void {
    delete part.image_url;
    // Phrasing matches openai/fileUpload.ts so the model reads it as a
    // system note, not user input.
    const note = `{error: ${reason}; the user did not write this message}`;
    Object.assign(part, { type: 'text', text: note });
}