fix: optimise reading large file content (#3767)

This commit is contained in:
Michael Neale
2025-08-06 09:38:52 +10:00
committed by GitHub
parent 44526de207
commit 8f54fa84a5
5 changed files with 369 additions and 121 deletions

View File

@@ -14,5 +14,5 @@ that you can call the functionality from the server from the typescript.
tips:
- can look at unstaged changes for what is being worked on if starting
- always check rust compiles, cargo fmt etc and cargo clippy -- -D warnings (as well as run tests in files you are working on)
- always check rust compiles, cargo fmt etc and `./scripts/clippy-lint.sh` (as well as run tests in files you are working on)
- in ui/desktop, look at how you can run lint checks and if other tests can run

View File

@@ -67,8 +67,6 @@ impl EditorModelImpl for MorphLLMEditor {
_old_str: &str,
update_snippet: &str,
) -> Result<String, String> {
eprintln!("Calling MorphLLM Editor API");
// Construct the full URL
let provider_url = if self.host.ends_with("/chat/completions") {
self.host.clone()
@@ -128,7 +126,6 @@ impl EditorModelImpl for MorphLLMEditor {
.and_then(|content| content.as_str())
.ok_or_else(|| "Invalid response format".to_string())?;
eprintln!("MorphLLM Editor API worked");
Ok(content.to_string())
}

View File

@@ -48,6 +48,7 @@ use ignore::gitignore::{Gitignore, GitignoreBuilder};
// Embeds the prompts directory to the build
static PROMPTS_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/src/developer/prompts");
const LINE_READ_LIMIT: usize = 2000;
/// Loads prompt files from the embedded PROMPTS_DIR and returns a HashMap of prompts.
/// Ensures that each prompt name is unique.
@@ -823,130 +824,151 @@ impl DeveloperRouter {
}
}
// Helper method to validate and calculate view range indices
fn calculate_view_range(
&self,
view_range: Option<(usize, i64)>,
total_lines: usize,
) -> Result<(usize, usize), ToolError> {
if let Some((start_line, end_line)) = view_range {
// Convert 1-indexed line numbers to 0-indexed
let start_idx = if start_line > 0 { start_line - 1 } else { 0 };
let end_idx = if end_line == -1 {
total_lines
} else {
std::cmp::min(end_line as usize, total_lines)
};
if start_idx >= total_lines {
return Err(ToolError::InvalidParameters(format!(
"Start line {} is beyond the end of the file (total lines: {})",
start_line, total_lines
)));
}
if start_idx >= end_idx {
return Err(ToolError::InvalidParameters(format!(
"Start line {} must be less than end line {}",
start_line, end_line
)));
}
Ok((start_idx, end_idx))
} else {
Ok((0, total_lines))
}
}
// Helper method to format file content with line numbers
fn format_file_content(
&self,
path: &Path,
lines: &[&str],
start_idx: usize,
end_idx: usize,
view_range: Option<(usize, i64)>,
) -> String {
let display_content = if lines.is_empty() {
String::new()
} else {
let selected_lines: Vec<String> = lines[start_idx..end_idx]
.iter()
.enumerate()
.map(|(i, line)| format!("{}: {}", start_idx + i + 1, line))
.collect();
selected_lines.join("\n")
};
let language = lang::get_language_identifier(path);
if view_range.is_some() {
formatdoc! {"
### {path} (lines {start}-{end})
```{language}
{content}
```
",
path=path.display(),
start=view_range.unwrap().0,
end=if view_range.unwrap().1 == -1 { "end".to_string() } else { view_range.unwrap().1.to_string() },
language=language,
content=display_content,
}
} else {
formatdoc! {"
### {path}
```{language}
{content}
```
",
path=path.display(),
language=language,
content=display_content,
}
}
}
async fn text_editor_view(
&self,
path: &PathBuf,
view_range: Option<(usize, i64)>,
) -> Result<Vec<Content>, ToolError> {
if path.is_file() {
// Check file size first (400KB limit)
const MAX_FILE_SIZE: u64 = 400 * 1024; // 400KB in bytes
let f = File::open(path)
.map_err(|e| ToolError::ExecutionError(format!("Failed to open file: {}", e)))?;
let file_size = f
.metadata()
.map_err(|e| {
ToolError::ExecutionError(format!("Failed to get file metadata: {}", e))
})?
.len();
if file_size > MAX_FILE_SIZE {
return Err(ToolError::ExecutionError(format!(
"File '{}' is too large ({:.2}KB). Maximum size is 400KB to prevent memory issues.",
path.display(),
file_size as f64 / 1024.0
)));
}
// Ensure we never read over that limit even if the file is being concurrently mutated
// (e.g. it's a log file)
let mut f = f.take(MAX_FILE_SIZE);
let uri = Url::from_file_path(path)
.map_err(|_| ToolError::ExecutionError("Invalid file path".into()))?
.to_string();
let mut content = String::new();
f.read_to_string(&mut content)
.map_err(|e| ToolError::ExecutionError(format!("Failed to read file: {}", e)))?;
let lines: Vec<&str> = content.lines().collect();
let total_lines = lines.len();
// Handle view_range if provided, otherwise show all lines
let (start_idx, end_idx) = if let Some((start_line, end_line)) = view_range {
// Convert 1-indexed line numbers to 0-indexed
let start_idx = if start_line > 0 { start_line - 1 } else { 0 };
let end_idx = if end_line == -1 {
total_lines
} else {
std::cmp::min(end_line as usize, total_lines)
};
if start_idx >= total_lines {
return Err(ToolError::InvalidParameters(format!(
"Start line {} is beyond the end of the file (total lines: {})",
start_line, total_lines
)));
}
if start_idx >= end_idx {
return Err(ToolError::InvalidParameters(format!(
"Start line {} must be less than end line {}",
start_line, end_line
)));
}
(start_idx, end_idx)
} else {
(0, total_lines)
};
// Always format lines with line numbers for better usability
let display_content = if total_lines == 0 {
String::new()
} else {
let selected_lines: Vec<String> = lines[start_idx..end_idx]
.iter()
.enumerate()
.map(|(i, line)| format!("{}: {}", start_idx + i + 1, line))
.collect();
selected_lines.join("\n")
};
let language = lang::get_language_identifier(path);
let formatted = if view_range.is_some() {
formatdoc! {"
### {path} (lines {start}-{end})
```{language}
{content}
```
",
path=path.display(),
start=view_range.unwrap().0,
end=if view_range.unwrap().1 == -1 { "end".to_string() } else { view_range.unwrap().1.to_string() },
language=language,
content=display_content,
}
} else {
formatdoc! {"
### {path}
```{language}
{content}
```
",
path=path.display(),
language=language,
content=display_content,
}
};
// The LLM gets just a quick update as we expect the file to view in the status
// but we send a low priority message for the human
Ok(vec![
Content::embedded_text(uri, content).with_audience(vec![Role::Assistant]),
Content::text(formatted)
.with_audience(vec![Role::User])
.with_priority(0.0),
])
} else {
Err(ToolError::ExecutionError(format!(
if !path.is_file() {
return Err(ToolError::ExecutionError(format!(
"The path '{}' does not exist or is not a file.",
path.display()
)))
)));
}
const MAX_FILE_SIZE: u64 = 400 * 1024; // 400KB
let f = File::open(path)
.map_err(|e| ToolError::ExecutionError(format!("Failed to open file: {}", e)))?;
let file_size = f
.metadata()
.map_err(|e| ToolError::ExecutionError(format!("Failed to get file metadata: {}", e)))?
.len();
if file_size > MAX_FILE_SIZE {
return Err(ToolError::ExecutionError(format!(
"File '{}' is too large ({:.2}KB). Maximum size is 400KB to prevent memory issues.",
path.display(),
file_size as f64 / 1024.0
)));
}
// Ensure we never read over that limit even if the file is being concurrently mutated
let mut f = f.take(MAX_FILE_SIZE);
let uri = Url::from_file_path(path)
.map_err(|_| ToolError::ExecutionError("Invalid file path".into()))?
.to_string();
let mut content = String::new();
f.read_to_string(&mut content)
.map_err(|e| ToolError::ExecutionError(format!("Failed to read file: {}", e)))?;
let lines: Vec<&str> = content.lines().collect();
let total_lines = lines.len();
// We will gently encourage the LLM to specify a range for large line count files
// it can of course specify exact range to read any size file
if view_range.is_none() && total_lines > LINE_READ_LIMIT {
return recommend_read_range(path, total_lines);
}
let (start_idx, end_idx) = self.calculate_view_range(view_range, total_lines)?;
let formatted = self.format_file_content(path, &lines, start_idx, end_idx, view_range);
// The LLM gets just a quick update as we expect the file to view in the status
// but we send a low priority message for the human
Ok(vec![
Content::embedded_text(uri, content).with_audience(vec![Role::Assistant]),
Content::text(formatted)
.with_audience(vec![Role::User])
.with_priority(0.0),
])
}
async fn text_editor_write(
@@ -1466,6 +1488,15 @@ impl DeveloperRouter {
}
}
fn recommend_read_range(path: &Path, total_lines: usize) -> Result<Vec<Content>, ToolError> {
Err(ToolError::ExecutionError(format!(
"File '{}' is {} lines long, recommended to read in with view_range (or searching) to get bite size content. If you do wish to read all the file, please pass in view_range with [1, {}] to read it all at once",
path.display(),
total_lines,
total_lines
)))
}
impl Router for DeveloperRouter {
fn name(&self) -> String {
"developer".to_string()
@@ -1558,7 +1589,7 @@ impl Clone for DeveloperRouter {
instructions: self.instructions.clone(),
file_history: Arc::clone(&self.file_history),
ignore_patterns: Arc::clone(&self.ignore_patterns),
editor_model: create_editor_model(), // Recreate the editor model since it's not Clone
editor_model: create_editor_model(),
}
}
}
@@ -3084,6 +3115,226 @@ mod tests {
temp_dir.close().unwrap();
}
#[tokio::test]
#[serial]
async fn test_text_editor_view_large_file_without_range() {
let router = get_router().await;
let temp_dir = tempfile::tempdir().unwrap();
let file_path = temp_dir.path().join("large_file.txt");
let file_path_str = file_path.to_str().unwrap();
std::env::set_current_dir(&temp_dir).unwrap();
// Create a file with more than LINE_READ_LIMIT lines
let mut content = String::new();
for i in 1..=LINE_READ_LIMIT + 1 {
content.push_str(&format!("Line {}\n", i));
}
router
.call_tool(
"text_editor",
json!({
"command": "write",
"path": file_path_str,
"file_text": content
}),
dummy_sender(),
)
.await
.unwrap();
// Test viewing without view_range - should trigger the error
let result = router
.call_tool(
"text_editor",
json!({
"command": "view",
"path": file_path_str
}),
dummy_sender(),
)
.await;
assert!(result.is_err());
let err = result.err().unwrap();
assert!(matches!(err, ToolError::ExecutionError(_)));
assert!(err.to_string().contains("2001 lines long"));
assert!(err
.to_string()
.contains("recommended to read in with view_range"));
assert!(err
.to_string()
.contains("please pass in view_range with [1, 2001]"));
// Test viewing with view_range - should work
let result = router
.call_tool(
"text_editor",
json!({
"command": "view",
"path": file_path_str,
"view_range": [1, 100]
}),
dummy_sender(),
)
.await;
assert!(result.is_ok());
let view_result = result.unwrap();
let text = view_result
.iter()
.find(|c| {
c.audience()
.is_some_and(|roles| roles.contains(&Role::User))
})
.unwrap()
.as_text()
.unwrap();
// Should contain lines 1-100
assert!(text.text.contains("1: Line 1"));
assert!(text.text.contains("100: Line 100"));
assert!(!text.text.contains("101: Line 101"));
// Test viewing with explicit full range - should work
let result = router
.call_tool(
"text_editor",
json!({
"command": "view",
"path": file_path_str,
"view_range": [1, 2001]
}),
dummy_sender(),
)
.await;
assert!(result.is_ok());
temp_dir.close().unwrap();
}
#[tokio::test]
#[serial]
async fn test_text_editor_view_file_with_exactly_2000_lines() {
let router = get_router().await;
let temp_dir = tempfile::tempdir().unwrap();
let file_path = temp_dir.path().join("file_2000.txt");
let file_path_str = file_path.to_str().unwrap();
std::env::set_current_dir(&temp_dir).unwrap();
// Create a file with exactly 2000 lines (should not trigger the check)
let mut content = String::new();
for i in 1..=2000 {
content.push_str(&format!("Line {}\n", i));
}
router
.call_tool(
"text_editor",
json!({
"command": "write",
"path": file_path_str,
"file_text": content
}),
dummy_sender(),
)
.await
.unwrap();
// Test viewing without view_range - should work since it's exactly 2000 lines
let result = router
.call_tool(
"text_editor",
json!({
"command": "view",
"path": file_path_str
}),
dummy_sender(),
)
.await;
assert!(result.is_ok());
let view_result = result.unwrap();
let text = view_result
.iter()
.find(|c| {
c.audience()
.is_some_and(|roles| roles.contains(&Role::User))
})
.unwrap()
.as_text()
.unwrap();
// Should contain all lines
assert!(text.text.contains("1: Line 1"));
assert!(text.text.contains("2000: Line 2000"));
temp_dir.close().unwrap();
}
#[tokio::test]
#[serial]
async fn test_text_editor_view_small_file_without_range() {
let router = get_router().await;
let temp_dir = tempfile::tempdir().unwrap();
let file_path = temp_dir.path().join("small_file.txt");
let file_path_str = file_path.to_str().unwrap();
std::env::set_current_dir(&temp_dir).unwrap();
// Create a file with less than 2000 lines
let mut content = String::new();
for i in 1..=100 {
content.push_str(&format!("Line {}\n", i));
}
router
.call_tool(
"text_editor",
json!({
"command": "write",
"path": file_path_str,
"file_text": content
}),
dummy_sender(),
)
.await
.unwrap();
// Test viewing without view_range - should work fine
let result = router
.call_tool(
"text_editor",
json!({
"command": "view",
"path": file_path_str
}),
dummy_sender(),
)
.await;
assert!(result.is_ok());
let view_result = result.unwrap();
let text = view_result
.iter()
.find(|c| {
c.audience()
.is_some_and(|roles| roles.contains(&Role::User))
})
.unwrap()
.as_text()
.unwrap();
// Should contain all lines
assert!(text.text.contains("1: Line 1"));
assert!(text.text.contains("100: Line 100"));
temp_dir.close().unwrap();
}
#[tokio::test]
#[serial]
async fn test_bash_output_truncation() {