From e370450dabb09118be5b895b2b0ce5fa8de505ee Mon Sep 17 00:00:00 2001 From: Dmitry Ng <19asdek91@gmail.com> Date: Sun, 3 May 2026 00:45:27 +0300 Subject: [PATCH] feat(browser): enhance HTML and MD content handling with warnings for small content and errors for binary URLs - Updated `getHTML` and `getMD` methods to return warnings for small content instead of errors. - Implemented checks for binary URLs, returning descriptive errors when such URLs are encountered. - Added new tests to validate the updated behavior for small and empty content handling. --- backend/pkg/tools/browser.go | 50 +++++++++- backend/pkg/tools/browser_test.go | 147 ++++++++++++++++++++++++++++-- 2 files changed, 189 insertions(+), 8 deletions(-) diff --git a/backend/pkg/tools/browser.go b/backend/pkg/tools/browser.go index 593d127..8eaa75b 100644 --- a/backend/pkg/tools/browser.go +++ b/backend/pkg/tools/browser.go @@ -27,6 +27,32 @@ const ( minImgContentSize = 2048 ) +// nonHTMLExtensions lists URL path suffixes that point to resources the scraper +// cannot render as text. When a URL matches, the browser tool returns a +// descriptive hint instead of a generic "content too small" error. +var nonHTMLExtensions = []string{ + ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", + ".zip", ".tar", ".gz", ".bz2", ".rar", ".7z", + ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg", ".ico", + ".mp3", ".mp4", ".avi", ".mov", ".mkv", ".wav", + ".exe", ".bin", ".dll", ".so", ".dmg", ".apk", +} + +// isBinaryURL returns true when the URL points to a known non-HTML resource. +func isBinaryURL(rawURL string) bool { + lower := strings.ToLower(rawURL) + // strip query string for extension matching + if idx := strings.Index(lower, "?"); idx != -1 { + lower = lower[:idx] + } + for _, ext := range nonHTMLExtensions { + if strings.HasSuffix(lower, ext) { + return true + } + } + return false +} + var localZones = []string{ ".localdomain", ".local", @@ -334,6 +360,13 @@ func (b *browser) saveScreenshotData(screenshot []byte) (string, error) { } func (b *browser) getMD(targetURL string) (string, error) { + if isBinaryURL(targetURL) { + return "", fmt.Errorf( + "the URL appears to point to a binary/non-HTML resource (e.g. PDF, image, archive) " + + "that cannot be rendered as markdown. Use the terminal tool with curl/wget to download it instead", + ) + } + scraperURL, err := b.resolveUrl(targetURL) if err != nil { return "", fmt.Errorf("failed to resolve url: %w", err) @@ -349,13 +382,23 @@ func (b *browser) getMD(targetURL string) (string, error) { return "", fmt.Errorf("failed to fetch content by url '%s': %w", targetURL, err) } if len(content) < minMdContentSize { - return "", fmt.Errorf("content size is less than minimum: %d bytes", minMdContentSize) + return fmt.Sprintf( + "[WARNING: page returned very little content (%d bytes), it may be a redirect, error page, or near-empty]\n\n%s", + len(content), string(content), + ), nil } return string(content), nil } func (b *browser) getHTML(targetURL string) (string, error) { + if isBinaryURL(targetURL) { + return "", fmt.Errorf( + "the URL appears to point to a binary/non-HTML resource (e.g. PDF, image, archive) " + + "that cannot be rendered as HTML. Use the terminal tool with curl/wget to download it instead", + ) + } + scraperURL, err := b.resolveUrl(targetURL) if err != nil { return "", fmt.Errorf("failed to resolve url: %w", err) @@ -371,7 +414,10 @@ func (b *browser) getHTML(targetURL string) (string, error) { return "", fmt.Errorf("failed to fetch content by url '%s': %w", targetURL, err) } if len(content) < minHtmlContentSize { - return "", fmt.Errorf("content size is less than minimum: %d bytes", minHtmlContentSize) + return fmt.Sprintf( + "[WARNING: page returned very little HTML content (%d bytes), it may be a redirect, error page, or near-empty]\n\n%s", + len(content), string(content), + ), nil } return string(content), nil diff --git a/backend/pkg/tools/browser_test.go b/backend/pkg/tools/browser_test.go index b76b878..34d5ac2 100644 --- a/backend/pkg/tools/browser_test.go +++ b/backend/pkg/tools/browser_test.go @@ -365,10 +365,10 @@ func TestContentMD_BothSucceed_ReturnsContentAndScreenshot(t *testing.T) { } } -func TestGetHTML_UsesCorrectMinContentSize(t *testing.T) { +func TestGetHTML_SmallContent_ReturnsWarning(t *testing.T) { // Serve HTML content that is larger than minMdContentSize (50) - // but smaller than minHtmlContentSize (300). - // With the fix, getHTML should reject this; before the fix it would accept it. + // but smaller than minHtmlContentSize (300). With the updated behaviour, + // getHTML must return the content with a WARNING prefix instead of an error. smallHTML := strings.Repeat("x", minMdContentSize+10) // 60 bytes: > 50, < 300 ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -382,12 +382,147 @@ func TestGetHTML_UsesCorrectMinContentSize(t *testing.T) { scPubURL: ts.URL, } + result, err := b.getHTML("https://example.com/page") + if err != nil { + t.Fatalf("getHTML() should not error for small-but-non-empty content, got: %v", err) + } + if !strings.Contains(result, "[WARNING:") { + t.Errorf("getHTML() should prefix result with WARNING for small content, got: %q", result) + } + if !strings.Contains(result, smallHTML) { + t.Errorf("getHTML() should include the actual content in the result") + } +} + +func TestGetHTML_EmptyContent_ReturnsError(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + // intentionally write nothing — callScraper detects empty body and errors + })) + defer ts.Close() + + b := &browser{flowID: 1, scPubURL: ts.URL} + _, err := b.getHTML("https://example.com/page") if err == nil { - t.Fatal("getHTML() should reject content smaller than minHtmlContentSize") + t.Fatal("getHTML() should error for empty content") } - if !strings.Contains(err.Error(), fmt.Sprintf("%d bytes", minHtmlContentSize)) { - t.Errorf("getHTML() error should reference minHtmlContentSize (%d), got: %v", minHtmlContentSize, err) + // callScraper wraps the error as "failed to fetch content by url '...': empty response body..." + if !strings.Contains(err.Error(), "failed to fetch content") { + t.Errorf("getHTML() error should come from callScraper, got: %v", err) + } +} + +func TestGetHTML_BinaryURL_ReturnsError(t *testing.T) { + b := &browser{flowID: 1, scPubURL: "http://127.0.0.1:1"} + + _, err := b.getHTML("https://example.com/report.pdf") + if err == nil { + t.Fatal("getHTML() should error for binary URL") + } + if !strings.Contains(err.Error(), "binary") { + t.Errorf("getHTML() error should mention binary resource, got: %v", err) + } + if !strings.Contains(err.Error(), "curl") { + t.Errorf("getHTML() error should hint about curl/wget, got: %v", err) + } +} + +func TestGetMD_SmallContent_ReturnsWarning(t *testing.T) { + tiny := strings.Repeat("x", minMdContentSize-1) // just below minimum, but not 0 + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + fmt.Fprint(w, tiny) + })) + defer ts.Close() + + b := &browser{flowID: 1, scPubURL: ts.URL} + + result, err := b.getMD("https://example.com/page") + if err != nil { + t.Fatalf("getMD() should not error for small-but-non-empty content, got: %v", err) + } + if !strings.Contains(result, "[WARNING:") { + t.Errorf("getMD() should prefix result with WARNING for small content, got: %q", result) + } + if !strings.Contains(result, tiny) { + t.Errorf("getMD() should include the actual content in the result") + } +} + +func TestGetMD_EmptyContent_ReturnsError(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + // intentionally write nothing — callScraper detects empty body and errors + })) + defer ts.Close() + + b := &browser{flowID: 1, scPubURL: ts.URL} + + _, err := b.getMD("https://example.com/page") + if err == nil { + t.Fatal("getMD() should error for empty content") + } + if !strings.Contains(err.Error(), "failed to fetch content") { + t.Errorf("getMD() error should come from callScraper, got: %v", err) + } +} + +func TestGetMD_BinaryURL_ReturnsError(t *testing.T) { + b := &browser{flowID: 1, scPubURL: "http://127.0.0.1:1"} + + for _, url := range []string{ + "https://example.com/file.pdf", + "https://host.io/archive.zip", + "https://cdn.example.com/image.png", + "https://example.com/data.xlsx?token=abc", + } { + _, err := b.getMD(url) + if err == nil { + t.Errorf("getMD(%q) should error for binary URL", url) + continue + } + if !strings.Contains(err.Error(), "binary") { + t.Errorf("getMD(%q) error should mention binary resource, got: %v", url, err) + } + } +} + +func TestIsBinaryURL(t *testing.T) { + tests := []struct { + url string + want bool + }{ + {"https://example.com/report.pdf", true}, + {"https://example.com/REPORT.PDF", true}, + {"https://cdn.example.com/image.png", true}, + {"https://cdn.example.com/photo.jpg", true}, + {"https://cdn.example.com/photo.JPEG", true}, + {"https://example.com/archive.zip", true}, + {"https://example.com/data.xlsx", true}, + {"https://example.com/slides.pptx", true}, + {"https://example.com/file.exe", true}, + {"https://example.com/lib.so", true}, + {"https://example.com/video.mp4", true}, + // query string is stripped before extension check + {"https://example.com/file.pdf?token=abc&download=1", true}, + // non-binary URLs + {"https://example.com/page", false}, + {"https://example.com/report.html", false}, + {"https://example.com/api/data.json", false}, + {"https://example.com/path/to/index", false}, + // pdf in path component but not as suffix + {"https://example.com/pdf-guide/intro", false}, + } + + for _, tt := range tests { + t.Run(tt.url, func(t *testing.T) { + got := isBinaryURL(tt.url) + if got != tt.want { + t.Errorf("isBinaryURL(%q) = %v, want %v", tt.url, got, tt.want) + } + }) } }