Skip to content
This repository was archived by the owner on Jan 2, 2025. It is now read-only.
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 48 additions & 51 deletions server/bleep/src/agent/transcoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -494,45 +494,12 @@ fn xml_for_each(article: &str, f: impl Fn(&str) -> Option<String>) -> String {
}

fn try_trim_code_xml(xml: &str) -> Result<String> {
let xml = fixup_xml_code(xml);

let code_chunk = quick_xml::de::from_str(&xml).context("couldn't parse as XML code block")?;

Ok(match code_chunk {
CodeChunk::QuotedCode {
code: _,
language,
path,
start_line,
end_line,
} => {
let start_line = start_line
.map(|n| format!("<StartLine>{n}</StartLine>\n"))
.unwrap_or_default();
let end_line = end_line
.map(|n| format!("<EndLine>{n}</EndLine>\n"))
.unwrap_or_default();

format!(
"<QuotedCode>\n\
<Code>[REDACTED]</Code>\n\
<Language>{language}</Language>\n\
<Path>{path}</Path>\n\
{start_line}\
{end_line}\
</QuotedCode>"
)
}
// We just remove code chunks completely.

CodeChunk::GeneratedCode { code: _, language } => {
format!(
"<GeneratedCode>\n\
<Code>[REDACTED]</Code>\n\
<Language>{language}</Language>\n\
</GeneratedCode>"
)
}
})
let xml = fixup_xml_code(xml);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to do this work at all if we're returning an empty string?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do need to do this for partially generated messages. For example, if a message has only been generated as far as `<GeneratedCode><Code>foo`, we need to call `fixup_xml_code` to complete the block so that it can still be parsed on the line right below. If parsing fails here (e.g. because it is a different kind of XML block, perhaps HTML in the markdown), we know the block is not a code chunk, so it should be kept.

let _code_chunk: CodeChunk =
quick_xml::de::from_str(&xml).context("couldn't parse as XML code block")?;
Ok(String::new())
}

pub fn limit_tokens(text: &str, bpe: CoreBPE, max_tokens: usize) -> &str {
Expand Down Expand Up @@ -587,19 +554,58 @@ test";

let expected = "Sample Markdown test.





test
test
test";

let out = xml_for_each(input, |code| try_trim_code_xml(code).ok());

assert_eq!(expected, out);
}

#[test]
fn test_trim_code_with_regular_xml() {
let input = "Sample Markdown test.

<p>hello</p>

<QuotedCode>
<Code>[REDACTED]</Code>
<Code>
fn foo() -> i32 {
42
}
</Code>
<Language>Rust</Language>
<Path>src/main.rs</Path>
<StartLine>10</StartLine>
<EndLine>12</EndLine>
</QuotedCode>

<GeneratedCode>
<Code>[REDACTED]</Code>
<Code>
fn foo() -> i32 {
42
}
</Code>
<Language>Rust</Language>
</GeneratedCode>

test
test
test";

let expected = "Sample Markdown test.

<p>hello</p>





test
test
test";
Expand Down Expand Up @@ -1072,20 +1078,11 @@ fn main() {

let expected = "Foo

<QuotedCode>
<Code>[REDACTED]</Code>
<Language>Rust</Language>
<Path>src/main.rs</Path>
<StartLine>1</StartLine>
<EndLine>3</EndLine>
</QuotedCode>


Bar.

<GeneratedCode>
<Code>[REDACTED]</Code>
<Language>Rust</Language>
</GeneratedCode>


[^summary]: Test **summary**.";

Expand Down