feat: b6-http-proxy-v2 — batch 6

2026-06-14 10:22:14 +08:00 · 2026-04-07 15:52:11 +09:00
parent 8a4b613c39
commit 18d3c1918b
4 changed files with 558 additions and 0 deletions
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -1580,6 +1580,7 @@ version = "0.1.0"
 dependencies = [
 "api",
 "commands",
 "flate2",
 "plugins",
 "reqwest",
 "runtime",
--- a/rust/crates/tools/Cargo.toml
+++ b/rust/crates/tools/Cargo.toml
@@ -8,6 +8,7 @@ publish.workspace = true
 [dependencies]
 api = { path = "../api" }
 commands = { path = "../commands" }
 flate2 = "1"
 plugins = { path = "../plugins" }
 runtime = { path = "../runtime" }
 reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
--- a/rust/crates/tools/src/lib.rs
+++ b/rust/crates/tools/src/lib.rs
@@ -5306,6 +5306,7 @@ fn parse_skill_description(contents: &str) -> Option<String> {
 }
 pub mod lane_completion;
 pub mod pdf_extract;
 #[cfg(test)]
 mod tests {
--- a/rust/crates/tools/src/pdf_extract.rs
+++ b/rust/crates/tools/src/pdf_extract.rs
@@ -0,0 +1,555 @@
 //! Minimal PDF text extraction.
 //!
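 //! Reads a PDF file, scans it for every `stream` … `endstream` pair (no
 //! object graph or `/Contents` lookup is performed), decompresses the data
 //! with flate2 when the preceding dictionary mentions `/FlateDecode`, and
 //! extracts the strings shown by text operators between `BT` / `ET` markers.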
 use std::io::Read as _;
 use std::path::Path;
 /// Extract all readable text from a PDF file.
 ///
 /// Returns the concatenated text found inside BT/ET operators across all
 /// content streams.  Non-text pages or encrypted PDFs yield an empty string
 /// rather than an error.
 pub fn extract_text(path: &Path) -> Result<String, String> {
    let data = std::fs::read(path).map_err(|e| format!("failed to read PDF: {e}"))?;
    Ok(extract_text_from_bytes(&data))
 }
 /// Core extraction from raw PDF bytes — useful for testing without touching the
 /// filesystem.
 pub(crate) fn extract_text_from_bytes(data: &[u8]) -> String {
    let mut all_text = String::new();
    let mut offset = 0;
    while offset < data.len() {
        let Some(stream_start) = find_subsequence(&data[offset..], b"stream") else {
            break;
        };
        let abs_start = offset + stream_start;
        // Determine the byte offset right after "stream\r\n" or "stream\n".
        let content_start = skip_stream_eol(data, abs_start + b"stream".len());
        let Some(end_rel) = find_subsequence(&data[content_start..], b"endstream") else {
            break;
        };
        let content_end = content_start + end_rel;
        // Look backwards from "stream" for a FlateDecode hint in the object
        // dictionary.  We scan at most 512 bytes before the stream keyword.
        let dict_window_start = abs_start.saturating_sub(512);
        let dict_window = &data[dict_window_start..abs_start];
        let is_flate = find_subsequence(dict_window, b"FlateDecode").is_some();
        // Only process streams whose parent dictionary references /Contents or
        // looks like a page content stream (contains /Length).  We intentionally
        // keep this loose to cover both inline and referenced content streams.
        let raw = &data[content_start..content_end];
        let decompressed;
        let stream_bytes: &[u8] = if is_flate {
            match inflate(raw) {
                Ok(buf) => {
                    decompressed = buf;
                    &decompressed
                }
                Err(_) => {
                    offset = content_end;
                    continue;
                }
            }
        } else {
            raw
        };
        let text = extract_bt_et_text(stream_bytes);
        if !text.is_empty() {
            if !all_text.is_empty() {
                all_text.push('\n');
            }
            all_text.push_str(&text);
        }
        offset = content_end;
    }
    all_text
 }
 /// Inflate (zlib / deflate) compressed data via `flate2`.
 fn inflate(data: &[u8]) -> Result<Vec<u8>, String> {
    let mut decoder = flate2::read::ZlibDecoder::new(data);
    let mut buf = Vec::new();
    decoder
        .read_to_end(&mut buf)
        .map_err(|e| format!("flate2 inflate error: {e}"))?;
    Ok(buf)
 }
 /// Extract text from PDF content-stream operators between BT and ET markers.
 ///
 /// Handles the common text-showing operators:
 /// - `Tj`  — show a string
 /// - `TJ`  — show an array of strings/numbers
 /// - `'`   — move to next line and show string
 /// - `"`   — set spacing, move to next line and show string
 fn extract_bt_et_text(stream: &[u8]) -> String {
    let text = String::from_utf8_lossy(stream);
    let mut result = String::new();
    let mut in_bt = false;
    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed == "BT" {
            in_bt = true;
            continue;
        }
        if trimmed == "ET" {
            in_bt = false;
            continue;
        }
        if !in_bt {
            continue;
        }
        // Tj operator: (text) Tj
        if trimmed.ends_with("Tj") {
            if let Some(s) = extract_parenthesized_string(trimmed) {
                if !result.is_empty() && !result.ends_with('\n') {
                    result.push(' ');
                }
                result.push_str(&s);
            }
        }
        // TJ operator: [ (text) 123 (text) ] TJ
        else if trimmed.ends_with("TJ") {
            let extracted = extract_tj_array(trimmed);
            if !extracted.is_empty() {
                if !result.is_empty() && !result.ends_with('\n') {
                    result.push(' ');
                }
                result.push_str(&extracted);
            }
        }
        // ' operator: (text) '
        else if trimmed.ends_with('\'') && trimmed.len() > 1 {
            if let Some(s) = extract_parenthesized_string(trimmed) {
                if !result.is_empty() {
                    result.push('\n');
                }
                result.push_str(&s);
            }
        }
        // " operator: aw ac (text) "
        else if trimmed.ends_with('"') && trimmed.contains('(') {
            if let Some(s) = extract_parenthesized_string(trimmed) {
                if !result.is_empty() {
                    result.push('\n');
                }
                result.push_str(&s);
            }
        }
    }
    result
 }
 /// Pull the text from the first `(…)` group, handling escaped parens and
 /// common PDF escape sequences.
 fn extract_parenthesized_string(input: &str) -> Option<String> {
    let open = input.find('(')?;
    let bytes = input.as_bytes();
    let mut depth = 0;
    let mut result = String::new();
    let mut i = open;
    while i < bytes.len() {
        match bytes[i] {
            b'(' => {
                if depth > 0 {
                    result.push('(');
                }
                depth += 1;
            }
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Some(result);
                }
                result.push(')');
            }
            b'\\' if i + 1 < bytes.len() => {
                i += 1;
                match bytes[i] {
                    b'n' => result.push('\n'),
                    b'r' => result.push('\r'),
                    b't' => result.push('\t'),
                    b'\\' => result.push('\\'),
                    b'(' => result.push('('),
                    b')' => result.push(')'),
                    // Octal sequences — up to 3 digits.
                    d @ b'0'..=b'7' => {
                        let mut octal = (d - b'0') as u32;
                        for _ in 0..2 {
                            if i + 1 < bytes.len()
                                && bytes[i + 1].is_ascii_digit()
                                && bytes[i + 1] <= b'7'
                            {
                                i += 1;
                                octal = octal * 8 + (bytes[i] - b'0') as u32;
                            } else {
                                break;
                            }
                        }
                        if let Some(ch) = char::from_u32(octal) {
                            result.push(ch);
                        }
                    }
                    other => result.push(other as char),
                }
            }
            ch => result.push(ch as char),
        }
        i += 1;
    }
    None // unbalanced
 }
 /// Extract concatenated strings from a TJ array like `[ (Hello) -120 (World) ] TJ`.
 fn extract_tj_array(input: &str) -> String {
    let mut result = String::new();
    let Some(bracket_start) = input.find('[') else {
        return result;
    };
    let Some(bracket_end) = input.rfind(']') else {
        return result;
    };
    let inner = &input[bracket_start + 1..bracket_end];
    let mut i = 0;
    let bytes = inner.as_bytes();
    while i < bytes.len() {
        if bytes[i] == b'(' {
            // Reconstruct the parenthesized string and extract it.
            if let Some(s) = extract_parenthesized_string(&inner[i..]) {
                result.push_str(&s);
                // Skip past the closing paren.
                let mut depth = 0u32;
                for &b in &bytes[i..] {
                    i += 1;
                    if b == b'(' {
                        depth += 1;
                    } else if b == b')' {
                        depth -= 1;
                        if depth == 0 {
                            break;
                        }
                    }
                }
                continue;
            }
        }
        i += 1;
    }
    result
 }
 /// Skip past the end-of-line marker that immediately follows the `stream`
 /// keyword.  Per the PDF spec this is either `\r\n` or `\n`.
 fn skip_stream_eol(data: &[u8], pos: usize) -> usize {
    if pos < data.len() && data[pos] == b'\r' {
        if pos + 1 < data.len() && data[pos + 1] == b'\n' {
            return pos + 2;
        }
        return pos + 1;
    }
    if pos < data.len() && data[pos] == b'\n' {
        return pos + 1;
    }
    pos
 }
 /// Simple byte-subsequence search.
 fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
 }
 /// Check if a user-supplied path looks like a PDF file reference.
 pub fn looks_like_pdf_path(text: &str) -> Option<&str> {
    for token in text.split_whitespace() {
        let cleaned = token.trim_matches(|c: char| c == '\'' || c == '"' || c == '`');
        if cleaned.ends_with(".pdf") || cleaned.ends_with(".PDF") {
            return Some(cleaned);
        }
    }
    None
 }
 /// Auto-extract text from a PDF path mentioned in a user prompt.
 ///
 /// Returns `Some((path, extracted_text))` when a `.pdf` path is detected and
 /// the file exists, otherwise `None`.
 pub fn maybe_extract_pdf_from_prompt(prompt: &str) -> Option<(String, String)> {
    let pdf_path = looks_like_pdf_path(prompt)?;
    let path = Path::new(pdf_path);
    if !path.exists() {
        return None;
    }
    let text = extract_text(path).ok()?;
    if text.is_empty() {
        return None;
    }
    Some((pdf_path.to_string(), text))
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Build a minimal valid PDF with a single page containing uncompressed
    /// text.  This is the smallest PDF structure that exercises the BT/ET
    /// extraction path.
    fn build_simple_pdf(text: &str) -> Vec<u8> {
        let content_stream = format!("BT\n/F1 12 Tf\n({text}) Tj\nET");
        let stream_bytes = content_stream.as_bytes();
        let mut pdf = Vec::new();
        // Header
        pdf.extend_from_slice(b"%PDF-1.4\n");
        // Object 1 — Catalog
        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        // Object 2 — Pages
        let obj2_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");
        // Object 3 — Page
        let obj3_offset = pdf.len();
        pdf.extend_from_slice(
            b"3 0 obj\n<< /Type /Page /Parent 2 0 R /Contents 4 0 R >>\nendobj\n",
        );
        // Object 4 — Content stream (uncompressed)
        let obj4_offset = pdf.len();
        let length = stream_bytes.len();
        let header = format!("4 0 obj\n<< /Length {length} >>\nstream\n");
        pdf.extend_from_slice(header.as_bytes());
        pdf.extend_from_slice(stream_bytes);
        pdf.extend_from_slice(b"\nendstream\nendobj\n");
        // Cross-reference table
        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n0 5\n");
        pdf.extend_from_slice(format!("0000000000 65535 f \n").as_bytes());
        pdf.extend_from_slice(format!("{obj1_offset:010} 00000 n \n").as_bytes());
        pdf.extend_from_slice(format!("{obj2_offset:010} 00000 n \n").as_bytes());
        pdf.extend_from_slice(format!("{obj3_offset:010} 00000 n \n").as_bytes());
        pdf.extend_from_slice(format!("{obj4_offset:010} 00000 n \n").as_bytes());
        // Trailer
        pdf.extend_from_slice(b"trailer\n<< /Size 5 /Root 1 0 R >>\n");
        pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
        pdf
    }
    /// Build a minimal PDF with flate-compressed content stream.
    fn build_flate_pdf(text: &str) -> Vec<u8> {
        use flate2::write::ZlibEncoder;
        use flate2::Compression;
        use std::io::Write as _;
        let content_stream = format!("BT\n/F1 12 Tf\n({text}) Tj\nET");
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder
            .write_all(content_stream.as_bytes())
            .expect("compress");
        let compressed = encoder.finish().expect("finish");
        let mut pdf = Vec::new();
        pdf.extend_from_slice(b"%PDF-1.4\n");
        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        let obj2_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");
        let obj3_offset = pdf.len();
        pdf.extend_from_slice(
            b"3 0 obj\n<< /Type /Page /Parent 2 0 R /Contents 4 0 R >>\nendobj\n",
        );
        let obj4_offset = pdf.len();
        let length = compressed.len();
        let header = format!("4 0 obj\n<< /Length {length} /Filter /FlateDecode >>\nstream\n");
        pdf.extend_from_slice(header.as_bytes());
        pdf.extend_from_slice(&compressed);
        pdf.extend_from_slice(b"\nendstream\nendobj\n");
        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n0 5\n");
        pdf.extend_from_slice(format!("0000000000 65535 f \n").as_bytes());
        pdf.extend_from_slice(format!("{obj1_offset:010} 00000 n \n").as_bytes());
        pdf.extend_from_slice(format!("{obj2_offset:010} 00000 n \n").as_bytes());
        pdf.extend_from_slice(format!("{obj3_offset:010} 00000 n \n").as_bytes());
        pdf.extend_from_slice(format!("{obj4_offset:010} 00000 n \n").as_bytes());
        pdf.extend_from_slice(b"trailer\n<< /Size 5 /Root 1 0 R >>\n");
        pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
        pdf
    }
    #[test]
    fn extracts_uncompressed_text_from_minimal_pdf() {
        // given
        let pdf_bytes = build_simple_pdf("Hello World");
        // when
        let text = extract_text_from_bytes(&pdf_bytes);
        // then
        assert_eq!(text, "Hello World");
    }
    #[test]
    fn extracts_text_from_flate_compressed_stream() {
        // given
        let pdf_bytes = build_flate_pdf("Compressed PDF Text");
        // when
        let text = extract_text_from_bytes(&pdf_bytes);
        // then
        assert_eq!(text, "Compressed PDF Text");
    }
    #[test]
    fn handles_tj_array_operator() {
        // given
        let stream = b"BT\n/F1 12 Tf\n[ (Hello) -120 ( World) ] TJ\nET";
        let mut pdf = build_simple_pdf("");
        // Replace the content with our custom stream containing TJ
        let content_stream = std::str::from_utf8(stream).unwrap();
        let raw = format!(
            "%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n\
             2 0 obj\n<< /Length {} >>\nstream\n{}\nendstream\nendobj\n%%EOF\n",
            content_stream.len(),
            content_stream
        );
        let _ = pdf; // drop unused
        let pdf_bytes = raw.into_bytes();
        // when
        let text = extract_text_from_bytes(&pdf_bytes);
        // then
        assert_eq!(text, "Hello World");
    }
    #[test]
    fn handles_escaped_parentheses() {
        // given
        let content = b"BT\n(Hello \\(World\\)) Tj\nET";
        let raw = format!(
            "%PDF-1.4\n1 0 obj\n<< /Length {} >>\nstream\n",
            content.len()
        );
        let mut pdf_bytes = raw.into_bytes();
        pdf_bytes.extend_from_slice(content);
        pdf_bytes.extend_from_slice(b"\nendstream\nendobj\n%%EOF\n");
        // when
        let text = extract_text_from_bytes(&pdf_bytes);
        // then
        assert_eq!(text, "Hello (World)");
    }
    #[test]
    fn returns_empty_for_non_pdf_data() {
        // given
        let data = b"This is not a PDF file at all";
        // when
        let text = extract_text_from_bytes(data);
        // then
        assert!(text.is_empty());
    }
    #[test]
    fn extracts_text_from_file_on_disk() {
        // given
        let pdf_bytes = build_simple_pdf("Disk Test");
        let dir = std::env::temp_dir().join("clawd-pdf-extract-test");
        std::fs::create_dir_all(&dir).unwrap();
        let pdf_path = dir.join("test.pdf");
        std::fs::write(&pdf_path, &pdf_bytes).unwrap();
        // when
        let text = extract_text(&pdf_path).unwrap();
        // then
        assert_eq!(text, "Disk Test");
        // cleanup
        let _ = std::fs::remove_dir_all(&dir);
    }
    #[test]
    fn looks_like_pdf_path_detects_pdf_references() {
        // given / when / then
        assert_eq!(
            looks_like_pdf_path("Please read /tmp/report.pdf"),
            Some("/tmp/report.pdf")
        );
        assert_eq!(
            looks_like_pdf_path("Check 'my file.PDF' now"),
            Some("my file.PDF")
        );
        assert_eq!(looks_like_pdf_path("no pdf here"), None);
    }
    #[test]
    fn maybe_extract_pdf_from_prompt_returns_none_for_missing_file() {
        // given
        let prompt = "Read /tmp/nonexistent-abc123.pdf please";
        // when
        let result = maybe_extract_pdf_from_prompt(prompt);
        // then
        assert!(result.is_none());
    }
    #[test]
    fn maybe_extract_pdf_from_prompt_extracts_existing_file() {
        // given
        let pdf_bytes = build_simple_pdf("Auto Extracted");
        let dir = std::env::temp_dir().join("clawd-pdf-auto-extract-test");
        std::fs::create_dir_all(&dir).unwrap();
        let pdf_path = dir.join("auto.pdf");
        std::fs::write(&pdf_path, &pdf_bytes).unwrap();
        let prompt = format!("Summarize {}", pdf_path.display());
        // when
        let result = maybe_extract_pdf_from_prompt(&prompt);
        // then
        let (path, text) = result.expect("should extract");
        assert_eq!(path, pdf_path.display().to_string());
        assert_eq!(text, "Auto Extracted");
        // cleanup
        let _ = std::fs::remove_dir_all(&dir);
    }
 }