diff --git a/.github/workflows/tag.yaml b/.github/workflows/tag.yaml index ced2faa8c..a0d65624f 100644 --- a/.github/workflows/tag.yaml +++ b/.github/workflows/tag.yaml @@ -71,9 +71,7 @@ jobs: if any(.tags[]; test("beta")) then .tags += [ "gtstef/filebrowser:beta", - "ghcr.io/gtsteffaniak/filebrowser:beta", - "gtstef/filebrowser:latest", - "ghcr.io/gtsteffaniak/filebrowser:latest" + "ghcr.io/gtsteffaniak/filebrowser:beta" ] else . end | if any(.tags[]; test("stable")) then diff --git a/CHANGELOG.md b/CHANGELOG.md index 05ef1fc3e..d8cbc6c7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,22 @@ All notable changes to this project will be documented in this file. For commit guidelines, please refer to [Standard Version](https://github.com/conventional-changelog/standard-version). +## v1.1.2-beta + + **Notes**: + - changes to duplicate detector + - temp SQLite database each query to reduce memory pressure + - max limit is 500 groups total + - Downloads from UI requests utf-8 formatted names to support chinese and other characters. Updated swagger docs. + - beta releases no longer publish to `latest` docker tag #1675 + + **BugFixes**: + - better index status updates, fixing delays #1649 + - fixed long load times for listings with media info due sequential processing of files. + - downloaded files always included `utf-8` in filename #1671 + - custom sidebar links allow external links like `https://google.com` + - html title not populated correctly for links #1676 + ## v1.1.1-beta **Notes**: diff --git a/backend/adapters/fs/files/files.go b/backend/adapters/fs/files/files.go index 85fd40d6d..017a96651 100644 --- a/backend/adapters/fs/files/files.go +++ b/backend/adapters/fs/files/files.go @@ -9,6 +9,7 @@ import ( "path/filepath" "strconv" "strings" + "sync" "time" "unicode" "unicode/utf8" @@ -88,38 +89,60 @@ func FileInfoFaster(opts utils.FileOptions, access *access.Storage) (*iteminfo.E startTime := time.Now() metadataCount := 0 - for i := range response.Files { - fileItem := &response.Files[i] - isItemAudio := strings.HasPrefix(fileItem.Type, "audio") - isItemVideo := strings.HasPrefix(fileItem.Type, "video") - - if isItemAudio || isItemVideo { - // Get the real path for this file - itemRealPath, _, _ := index.GetRealPath(opts.Path, fileItem.Name) - - // Extract metadata for audio files (without album art for performance) - if isItemAudio { - err := extractAudioMetadata(context.Background(), fileItem, itemRealPath, opts.AlbumArt || opts.Content, opts.Metadata) - if err != nil { - logger.Debugf("failed to extract metadata for file: "+fileItem.Name, err) - } else { - metadataCount++ - } - } else if isItemVideo { - // Extract duration for video files - err := extractVideoMetadata(context.Background(), fileItem, itemRealPath) - if err != nil { - logger.Debugf("failed to extract video metadata for file: "+fileItem.Name, err) - } else { - metadataCount++ - } + // Create a single shared FFmpegService instance for all files to coordinate concurrency + sharedFFmpegService := ffmpeg.NewFFmpegService(10, false, "") + if sharedFFmpegService != nil { + // Process files concurrently using goroutines + var wg sync.WaitGroup + var mu sync.Mutex // Protects metadataCount + + for i := range response.Files { + fileItem := &response.Files[i] + isItemAudio := strings.HasPrefix(fileItem.Type, "audio") + isItemVideo := strings.HasPrefix(fileItem.Type, "video") + + if isItemAudio || isItemVideo { + // Get the real path for this file + itemRealPath, _, _ := index.GetRealPath(opts.Path, fileItem.Name) + + // 
Capture loop variables in local copies to avoid closure issues + item := fileItem + itemPath := itemRealPath + isAudio := isItemAudio + + wg.Go(func() { + // Extract metadata for audio files (without album art for performance) + if isAudio { + err := extractAudioMetadata(context.Background(), item, itemPath, opts.AlbumArt || opts.Content, opts.Metadata, sharedFFmpegService) + if err != nil { + logger.Debugf("failed to extract metadata for file: "+item.Name, err) + } else { + mu.Lock() + metadataCount++ + mu.Unlock() + } + } else { + // Extract duration for video files + err := extractVideoMetadata(context.Background(), item, itemPath, sharedFFmpegService) + if err != nil { + logger.Debugf("failed to extract video metadata for file: "+item.Name, err) + } else { + mu.Lock() + metadataCount++ + mu.Unlock() + } + } + }) } } + + // Wait for all goroutines to complete + wg.Wait() } if metadataCount > 0 { elapsed := time.Since(startTime) - logger.Debugf("Extracted metadata for %d audio/video files in %v (avg: %v per file)", + logger.Debugf("Extracted metadata for %d audio/video files concurrently in %v (avg: %v per file)", metadataCount, elapsed, elapsed/time.Duration(metadataCount)) } } @@ -150,7 +173,7 @@ func processContent(info *iteminfo.ExtendedFileInfo, idx *indexing.Index, opts u extItem := &iteminfo.ExtendedItemInfo{ ItemInfo: info.ItemInfo, } - err := extractVideoMetadata(context.Background(), extItem, info.RealPath) + err := extractVideoMetadata(context.Background(), extItem, info.RealPath, nil) if err != nil { logger.Debugf("failed to extract video metadata for file: "+info.RealPath, info.Name, err) } else { @@ -177,7 +200,7 @@ func processContent(info *iteminfo.ExtendedFileInfo, idx *indexing.Index, opts u extItem := &iteminfo.ExtendedItemInfo{ ItemInfo: info.ItemInfo, } - err := extractAudioMetadata(context.Background(), extItem, info.RealPath, opts.AlbumArt || opts.Content, opts.Metadata || opts.Content) + err := extractAudioMetadata(context.Background(), extItem, info.RealPath, opts.AlbumArt || opts.Content, opts.Metadata || opts.Content, nil) if err != nil { logger.Debugf("failed to extract audio metadata for file: "+info.RealPath, info.Name, err) } else { @@ -214,7 +237,8 @@ func generateOfficeId(realPath string) string { // extractAudioMetadata extracts metadata from an audio file using dhowden/tag // and optionally extracts duration using the ffmpeg service with concurrency control -func extractAudioMetadata(ctx context.Context, item *iteminfo.ExtendedItemInfo, realPath string, getArt bool, getDuration bool) error { +// If ffmpegService is nil, a new service will be created (for backward compatibility) +func extractAudioMetadata(ctx context.Context, item *iteminfo.ExtendedItemInfo, realPath string, getArt bool, getDuration bool, ffmpegService *ffmpeg.FFmpegService) error { file, err := os.Open(realPath) if err != nil { return err @@ -253,13 +277,17 @@ func extractAudioMetadata(ctx context.Context, item *iteminfo.ExtendedItemInfo, // Extract duration ONLY if explicitly requested using the ffmpeg VideoService // This respects concurrency limits and gracefully handles missing ffmpeg if getDuration { - ffmpegService := ffmpeg.NewFFmpegService(5, false, "") - if ffmpegService != nil { + // Use provided service or create a new one for backward compatibility + service := ffmpegService + if service == nil { + service = ffmpeg.NewFFmpegService(5, false, "") + } + if service != nil { startTime := time.Now() - if duration, err := ffmpegService.GetMediaDuration(ctx, realPath); err == nil { 
+ if duration, err := service.GetMediaDuration(ctx, realPath); err == nil { item.Metadata.Duration = int(duration) elapsed := time.Since(startTime) - if elapsed > 50*time.Millisecond { + if elapsed > 100*time.Millisecond { logger.Debugf("Duration extraction took %v for file: %s", elapsed, item.Name) } } @@ -284,11 +312,15 @@ func extractAudioMetadata(ctx context.Context, item *iteminfo.ExtendedItemInfo, } // extractVideoMetadata extracts duration from video files using ffprobe -func extractVideoMetadata(ctx context.Context, item *iteminfo.ExtendedItemInfo, realPath string) error { - // Extract duration using the ffmpeg VideoService with concurrency control - videoService := ffmpeg.NewFFmpegService(10, false, "") - if videoService != nil { - duration, err := videoService.GetMediaDuration(ctx, realPath) +// If ffmpegService is nil, a new service will be created (for backward compatibility) +func extractVideoMetadata(ctx context.Context, item *iteminfo.ExtendedItemInfo, realPath string, ffmpegService *ffmpeg.FFmpegService) error { + // Use provided service or create a new one for backward compatibility + service := ffmpegService + if service == nil { + service = ffmpeg.NewFFmpegService(10, false, "") + } + if service != nil { + duration, err := service.GetMediaDuration(ctx, realPath) if err != nil { return err } diff --git a/backend/database/sql/duplicates.go b/backend/database/sql/duplicates.go new file mode 100644 index 000000000..9157f9ae7 --- /dev/null +++ b/backend/database/sql/duplicates.go @@ -0,0 +1,121 @@ +package sql + +import ( + "database/sql" + "fmt" +) + +// FileLocation represents a file location in the index with metadata +// needed for duplicate detection operations. +type FileLocation struct { + DirPath string + FileIdx int + Name string + NormalizedName string + Extension string +} + +// CreateDuplicatesTable creates the files table and indexes needed for duplicate detection. +// This should be called once after creating a TempDB for duplicate operations. +// Indexes are created before data insertion so they're immediately available for queries. +// For small datasets (typical for duplicate detection), the performance difference is negligible. +func (t *TempDB) CreateDuplicatesTable() error { + createTableSQL := ` + CREATE TABLE IF NOT EXISTS files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dir_path TEXT NOT NULL, + file_idx INTEGER NOT NULL, + size INTEGER NOT NULL, + name TEXT NOT NULL, + normalized_name TEXT NOT NULL, + extension TEXT NOT NULL, + UNIQUE(dir_path, file_idx) + ); + CREATE INDEX IF NOT EXISTS idx_size ON files(size); + CREATE INDEX IF NOT EXISTS idx_size_count ON files(size, normalized_name); + ` + + _, err := t.Exec(createTableSQL) + return err +} + +// InsertFileForDuplicates inserts a file entry into the duplicates table. +// This is called during the first pass through the index to stream files into the database. +func (t *TempDB) InsertFileForDuplicates(dirPath string, fileIdx int, size int64, name, normalizedName, extension string) error { + _, err := t.Exec( + "INSERT OR IGNORE INTO files (dir_path, file_idx, size, name, normalized_name, extension) VALUES (?, ?, ?, ?, ?, ?)", + dirPath, fileIdx, size, name, normalizedName, extension, + ) + return err +} + +// GetSizeGroupsForDuplicates queries for all size groups that have 2+ files. +// Returns sizes in descending order (largest first) as a slice, and a map of size -> count. +// This is used to identify potential duplicate groups before detailed comparison. 
+// The SQL query efficiently filters and sorts, avoiding the need to create intermediate maps. +func (t *TempDB) GetSizeGroupsForDuplicates(minSize int64) ([]int64, map[int64]int, error) { + // Query to get sizes with 2+ files, sorted by size DESC + rows, err := t.Query(` + SELECT size, COUNT(*) as count + FROM files + WHERE size >= ? + GROUP BY size + HAVING COUNT(*) >= 2 + ORDER BY size DESC + `, minSize) + if err != nil { + return nil, nil, fmt.Errorf("failed to query size groups: %w", err) + } + defer rows.Close() + + var sizes []int64 + sizeCounts := make(map[int64]int) + for rows.Next() { + var size int64 + var count int + if err := rows.Scan(&size, &count); err != nil { + return nil, nil, fmt.Errorf("failed to scan row: %w", err) + } + sizes = append(sizes, size) + sizeCounts[size] = count + } + + return sizes, sizeCounts, rows.Err() +} + +// GetFilesBySizeForDuplicates queries for all files with a specific size. +// Used for processing one size group at a time to minimize memory usage. +func (t *TempDB) GetFilesBySizeForDuplicates(size int64) ([]FileLocation, error) { + rows, err := t.Query(` + SELECT dir_path, file_idx, name, normalized_name, extension + FROM files + WHERE size = ? + ORDER BY normalized_name + `, size) + if err != nil { + return nil, fmt.Errorf("failed to query files by size: %w", err) + } + defer rows.Close() + + var locations []FileLocation + for rows.Next() { + var loc FileLocation + if err := rows.Scan(&loc.DirPath, &loc.FileIdx, &loc.Name, &loc.NormalizedName, &loc.Extension); err != nil { + return nil, fmt.Errorf("failed to scan row: %w", err) + } + locations = append(locations, loc) + } + + return locations, rows.Err() +} + +// BulkInsertFilesForDuplicates inserts multiple files in a single transaction. +// This is more efficient than calling InsertFileForDuplicates multiple times. +// The transaction must be started by the caller using BeginTransaction(). +func BulkInsertFilesForDuplicates(tx *sql.Tx, dirPath string, fileIdx int, size int64, name, normalizedName, extension string) error { + _, err := tx.Exec( + "INSERT OR IGNORE INTO files (dir_path, file_idx, size, name, normalized_name, extension) VALUES (?, ?, ?, ?, ?, ?)", + dirPath, fileIdx, size, name, normalizedName, extension, + ) + return err +} diff --git a/backend/database/sql/sqlite.go b/backend/database/sql/sqlite.go new file mode 100644 index 000000000..294aad7dd --- /dev/null +++ b/backend/database/sql/sqlite.go @@ -0,0 +1,253 @@ +package sql + +import ( + "database/sql" + "fmt" + "os" + "path/filepath" + "sync" + "time" + + "github.com/gtsteffaniak/filebrowser/backend/adapters/fs/fileutils" + "github.com/gtsteffaniak/filebrowser/backend/common/settings" + "github.com/gtsteffaniak/go-logger/logger" + _ "modernc.org/sqlite" +) + +// TempDB manages a temporary SQLite database for operations that need +// to stream large datasets without loading everything into memory. +// This can be used by any part of the codebase that needs temporary SQLite storage. +type TempDB struct { + db *sql.DB + path string + mu sync.Mutex + startTime time.Time + config *TempDBConfig +} + +// TempDBConfig holds configuration options for temporary SQLite databases. +type TempDBConfig struct { + // CacheSizeKB is the page cache size in KB. Negative values are in pages. + // For one-time databases, a smaller cache (e.g., -2000 = ~8MB) is often sufficient. + // Default: -2000 (approximately 8MB) + CacheSizeKB int + + // MmapSize is the memory-mapped I/O size in bytes. Set to 0 to disable mmap. 
+ // For databases that fit in RAM, set this larger than the expected DB size. + // Default: 2GB (2147483648 bytes) + MmapSize int64 + + // Synchronous controls the synchronous mode. OFF is fastest but less safe. + // For temporary databases, OFF is acceptable. + // Default: OFF + Synchronous string + + // TempStore controls where temporary tables and indices are stored. + // Valid values: "FILE" (default), "MEMORY", "DEFAULT" + // Default: FILE (temporary tables stored on disk) + TempStore string + + // EnableLogging enables performance logging for debugging. + // Default: false + EnableLogging bool +} + +// mergeConfig merges the provided config with defaults, returning a new config. +// If provided config is nil or empty, returns default config. +func mergeConfig(provided *TempDBConfig) *TempDBConfig { + defaults := &TempDBConfig{ + CacheSizeKB: -2000, // ~8MB, appropriate for one-time databases + MmapSize: 2147483648, // 2GB + Synchronous: "OFF", + TempStore: "FILE", // Default to FILE, not MEMORY + EnableLogging: false, + } + + if provided == nil { + return defaults + } + + merged := *defaults // Copy defaults + + // Override with provided values if they are non-zero/non-empty + if provided.CacheSizeKB != 0 { + merged.CacheSizeKB = provided.CacheSizeKB + } + if provided.MmapSize != 0 { + merged.MmapSize = provided.MmapSize + } + if provided.Synchronous != "" { + merged.Synchronous = provided.Synchronous + } + if provided.TempStore != "" { + merged.TempStore = provided.TempStore + } + merged.EnableLogging = provided.EnableLogging + + return &merged +} + +// NewTempDB creates a new temporary SQLite database. +// The database is created in the cache directory's sql/ subdirectory. +// The database will be cleaned up on Close(). +// +// The database is optimized for bulk write-then-read operations with: +// - WAL journal mode for better concurrency +// - Configurable cache size (default: ~8MB for one-time DBs) +// - Memory-mapped I/O for faster access +// - OFF synchronous mode for maximum write performance +// - Configurable temp_store (default: FILE, can be set to MEMORY via config) +func NewTempDB(id string, config ...*TempDBConfig) (*TempDB, error) { + startTime := time.Now() + + // Merge provided config with defaults + var providedConfig *TempDBConfig + if len(config) > 0 { + providedConfig = config[0] + } + cfg := mergeConfig(providedConfig) + + // Create sql subdirectory in the cache directory + dbDir := filepath.Join(settings.Config.Server.CacheDir, "sql") + if err := os.MkdirAll(dbDir, fileutils.PermDir); err != nil { + return nil, fmt.Errorf("failed to create sql directory: %w", err) + } + + // Create temporary file in the sql subdirectory + tmpFile, err := os.CreateTemp(dbDir, fmt.Sprintf("%s.db", id)) + if err != nil { + return nil, fmt.Errorf("failed to create temp file: %w", err) + } + tmpPath := tmpFile.Name() + tmpFile.Close() + + // Open SQLite database with basic connection string + // We'll set PRAGMAs after connection for better control and logging + db, err := sql.Open("sqlite", tmpPath) + if err != nil { + os.Remove(tmpPath) + return nil, fmt.Errorf("failed to open SQLite database: %w", err) + } + + // Test the connection + if err := db.Ping(); err != nil { + db.Close() + os.Remove(tmpPath) + return nil, fmt.Errorf("failed to ping SQLite database: %w", err) + } + + // Apply optimizations via PRAGMA statements + // Execute them individually for compatibility and better error reporting + pragmaStart := time.Now() + + pragmas := []struct { + sql string + err string + }{ 
+ {"PRAGMA journal_mode = WAL;", "failed to set WAL mode"}, + {fmt.Sprintf("PRAGMA cache_size = %d;", cfg.CacheSizeKB), "failed to set cache_size"}, + {fmt.Sprintf("PRAGMA synchronous = %s;", cfg.Synchronous), "failed to set synchronous"}, + {fmt.Sprintf("PRAGMA temp_store = %s;", cfg.TempStore), "failed to set temp_store"}, + } + + if cfg.MmapSize > 0 { + pragmas = append(pragmas, struct { + sql string + err string + }{fmt.Sprintf("PRAGMA mmap_size = %d;", cfg.MmapSize), "failed to set mmap_size"}) + } + + for _, pragma := range pragmas { + if _, err := db.Exec(pragma.sql); err != nil { + db.Close() + os.Remove(tmpPath) + return nil, fmt.Errorf("%s: %w", pragma.err, err) + } + } + + pragmaDuration := time.Since(pragmaStart) + + // Log configuration if enabled + if cfg.EnableLogging { + logger.Debugf("[TempDB:%s] Created with cache_size=%d KB, mmap_size=%d bytes, synchronous=%s, temp_store=%s (setup took %v)", + id, cfg.CacheSizeKB, cfg.MmapSize, cfg.Synchronous, cfg.TempStore, pragmaDuration) + } + + return &TempDB{ + db: db, + path: tmpPath, + startTime: startTime, + config: cfg, + }, nil +} + +// DB returns the underlying *sql.DB connection. +// This allows callers to execute custom SQL queries if needed. +func (t *TempDB) DB() *sql.DB { + return t.db +} + +// BeginTransaction starts a transaction for bulk operations. +// The caller must call Commit() or Rollback() on the returned transaction. +// The mutex is NOT held during the transaction - caller is responsible for coordination. +func (t *TempDB) BeginTransaction() (*sql.Tx, error) { + return t.db.Begin() +} + +// Exec executes a SQL statement that doesn't return rows. +// This is a convenience method that handles locking. +func (t *TempDB) Exec(query string, args ...interface{}) (sql.Result, error) { + t.mu.Lock() + defer t.mu.Unlock() + return t.db.Exec(query, args...) +} + +// Query executes a query that returns rows. +// This is a convenience method that handles locking. +func (t *TempDB) Query(query string, args ...interface{}) (*sql.Rows, error) { + t.mu.Lock() + defer t.mu.Unlock() + return t.db.Query(query, args...) +} + +// QueryRow executes a query that is expected to return at most one row. +// This is a convenience method that handles locking. +func (t *TempDB) QueryRow(query string, args ...interface{}) *sql.Row { + t.mu.Lock() + defer t.mu.Unlock() + return t.db.QueryRow(query, args...) +} + +// Close closes the database connection and removes the temporary file. +// This should always be called when done with the database, typically in a defer statement. +// If logging is enabled, it will log the total lifetime and file size for performance analysis. +func (t *TempDB) Close() error { + t.mu.Lock() + defer t.mu.Unlock() + + if t.config != nil && t.config.EnableLogging { + totalDuration := time.Since(t.startTime) + fileInfo, err := os.Stat(t.path) + var fileSize int64 + if err == nil { + fileSize = fileInfo.Size() + } + logger.Debugf("[TempDB] Closed after %v, final size: %d bytes (%.2f MB)", + totalDuration, fileSize, float64(fileSize)/(1024*1024)) + } + + if t.db != nil { + if err := t.db.Close(); err != nil { + os.Remove(t.path) + return err + } + } + + return os.Remove(t.path) +} + +// Path returns the path to the temporary database file. +// This is useful for debugging or if you need to inspect the database. 
+func (t *TempDB) Path() string { + return t.path +} diff --git a/backend/events/eventRouter.go b/backend/events/eventRouter.go index 5c60e2b49..6486c981d 100644 --- a/backend/events/eventRouter.go +++ b/backend/events/eventRouter.go @@ -109,13 +109,20 @@ func SendToUsers(eventType, message string, users []string) { } func SendSourceUpdate(source string, message string) { - sourceUpdateChan <- sourceEvent{ + event := sourceEvent{ source: source, event: EventMessage{ EventType: "sourceUpdate", Message: message, }, } + select { + case sourceUpdateChan <- event: + // Event sent successfully + default: + // Channel is full, log warning but don't block + // This shouldn't happen under normal circumstances + } } func DebouncedBroadcast(eventType, message string) { @@ -129,15 +136,27 @@ func handleSourceUpdates() { for update := range sourceUpdateChan { sourceClientsMu.RLock() clients := sourceClients[update.source] + clientCount := len(clients) sourceClientsMu.RUnlock() + if clientCount == 0 { + // No clients registered for this source - this is normal if no one is connected + continue + } + + sentCount := 0 for ch := range clients { select { case ch <- update.event: + sentCount++ default: - // Optional: log dropped message + // Channel full, message dropped } } + // Log if we have clients but couldn't send to all + //if sentCount < clientCount { + // // Some messages were dropped due to full channels + //} } } diff --git a/backend/go.mod b/backend/go.mod index 2d08a1f18..51b05d3ac 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -27,6 +27,7 @@ require ( golang.org/x/sys v0.38.0 golang.org/x/time v0.14.0 gopkg.in/yaml.v3 v3.0.1 + modernc.org/sqlite v1.40.1 ) require ( @@ -91,6 +92,7 @@ require ( github.com/dlclark/regexp2 v1.11.5 // indirect github.com/dsoprea/go-logging v0.0.0-20200710184922-b02d349568dd // indirect github.com/dsoprea/go-utility/v2 v2.0.0-20221003172846-a3e1774ef349 // indirect + github.com/dustin/go-humanize v1.0.1 // indirect github.com/ebitengine/purego v0.9.1 // indirect github.com/ettle/strcase v0.2.0 // indirect github.com/fatih/color v1.18.0 // indirect @@ -141,6 +143,7 @@ require ( github.com/golangci/revgrep v0.8.0 // indirect github.com/golangci/swaggoswag v0.0.0-20250504205917-77f2aca3143e // indirect github.com/golangci/unconvert v0.0.0-20250410112200-a129a6e6413e // indirect + github.com/google/uuid v1.6.0 // indirect github.com/gordonklaus/ineffassign v0.2.0 // indirect github.com/gostaticanalysis/analysisutil v0.7.1 // indirect github.com/gostaticanalysis/comment v1.5.0 // indirect @@ -190,6 +193,7 @@ require ( github.com/moricho/tparallel v0.3.2 // indirect github.com/muesli/termenv v0.16.0 // indirect github.com/nakabonne/nestif v0.3.1 // indirect + github.com/ncruces/go-strftime v0.1.9 // indirect github.com/nishanths/exhaustive v0.12.0 // indirect github.com/nishanths/predeclared v0.2.2 // indirect github.com/nunnatsa/ginkgolinter v0.21.2 // indirect @@ -207,6 +211,7 @@ require ( github.com/quasilyte/regex/syntax v0.0.0-20210819130434-b3f0c404a727 // indirect github.com/quasilyte/stdinfo v0.0.0-20220114132959-f7386bf02567 // indirect github.com/raeperd/recvcheck v0.2.0 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect @@ -258,6 +263,7 @@ require ( go.uber.org/multierr v1.10.0 // indirect go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect + 
golang.org/x/exp v0.0.0-20250819193227-8b4c13bb791b // indirect golang.org/x/exp/typeparams v0.0.0-20251023183803-a4bb9ffd2546 // indirect golang.org/x/net v0.47.0 // indirect golang.org/x/sync v0.18.0 // indirect @@ -267,6 +273,9 @@ require ( gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect honnef.co/go/tools v0.6.1 // indirect + modernc.org/libc v1.66.10 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.11.0 // indirect mvdan.cc/gofumpt v0.9.2 // indirect mvdan.cc/unparam v0.0.0-20251027182757-5beb8c8f8f15 // indirect sigs.k8s.io/yaml v1.3.0 // indirect diff --git a/backend/go.sum b/backend/go.sum index ada8adc26..01124f7bb 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -236,6 +236,8 @@ github.com/dsoprea/go-utility/v2 v2.0.0-20221003142440-7a1927d49d9d/go.mod h1:LV github.com/dsoprea/go-utility/v2 v2.0.0-20221003160719-7bc88537c05e/go.mod h1:VZ7cB0pTjm1ADBWhJUOHESu4ZYy9JN+ZPqjfiW09EPU= github.com/dsoprea/go-utility/v2 v2.0.0-20221003172846-a3e1774ef349 h1:DilThiXje0z+3UQ5YjYiSRRzVdtamFpvBQXKwMglWqw= github.com/dsoprea/go-utility/v2 v2.0.0-20221003172846-a3e1774ef349/go.mod h1:4GC5sXji84i/p+irqghpPFZBF8tRN/Q7+700G0/DLe8= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A= github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= @@ -468,6 +470,8 @@ github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hf github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 h1:EEHtgt9IwisQ2AZ4pIsMjahcegHh6rmhqxzIRQIyepY= github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/gordonklaus/ineffassign v0.2.0 h1:Uths4KnmwxNJNzq87fwQQDDnbNb7De00VOk9Nu0TySs= @@ -635,6 +639,8 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/nakabonne/nestif v0.3.1 h1:wm28nZjhQY5HyYPx+weN3Q65k6ilSBxDb8v5S81B81U= github.com/nakabonne/nestif v0.3.1/go.mod h1:9EtoZochLn5iUprVDmDjqGKPofoUEBL8U4Ngq6aY7OE= +github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4= +github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/niklasfasching/go-org v1.9.1 h1:/3s4uTPOF06pImGa2Yvlp24yKXZoTYM+nsIlMzfpg/0= github.com/niklasfasching/go-org v1.9.1/go.mod h1:ZAGFFkWvUQcpazmi/8nHqwvARpr1xpb+Es67oUGX/48= github.com/nishanths/exhaustive v0.12.0 h1:vIY9sALmw6T/yxiASewa4TQcFsVYZQQRUQJhKRf3Swg= @@ -718,6 +724,8 @@ github.com/quasilyte/stdinfo v0.0.0-20220114132959-f7386bf02567 h1:M8mH9eK4OUR4l github.com/quasilyte/stdinfo v0.0.0-20220114132959-f7386bf02567/go.mod 
h1:DWNGW8A4Y+GyBgPuaQJuWiy0XYftx4Xm/y5Jqk9I6VQ= github.com/raeperd/recvcheck v0.2.0 h1:GnU+NsbiCqdC2XX5+vMZzP+jAJC5fht7rcVTAhX74UI= github.com/raeperd/recvcheck v0.2.0/go.mod h1:n04eYkwIR0JbgD73wT8wL4JjPC3wm0nFtzBnWNocnYU= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= @@ -1256,6 +1264,32 @@ honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9 honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= honnef.co/go/tools v0.6.1 h1:R094WgE8K4JirYjBaOpz/AvTyUu/3wbmAoskKN/pxTI= honnef.co/go/tools v0.6.1/go.mod h1:3puzxxljPCe8RGJX7BIy1plGbxEOZni5mR2aXe3/uk4= +modernc.org/cc/v4 v4.26.5 h1:xM3bX7Mve6G8K8b+T11ReenJOT+BmVqQj0FY5T4+5Y4= +modernc.org/cc/v4 v4.26.5/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= +modernc.org/ccgo/v4 v4.28.1 h1:wPKYn5EC/mYTqBO373jKjvX2n+3+aK7+sICCv4Fjy1A= +modernc.org/ccgo/v4 v4.28.1/go.mod h1:uD+4RnfrVgE6ec9NGguUNdhqzNIeeomeXf6CL0GTE5Q= +modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA= +modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc= +modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= +modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= +modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= +modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= +modernc.org/libc v1.66.10 h1:yZkb3YeLx4oynyR+iUsXsybsX4Ubx7MQlSYEw4yj59A= +modernc.org/libc v1.66.10/go.mod h1:8vGSEwvoUoltr4dlywvHqjtAqHBaw0j1jI7iFBTAr2I= +modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= +modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= +modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= +modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8= +modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= +modernc.org/sqlite v1.40.1 h1:VfuXcxcUWWKRBuP8+BR9L7VnmusMgBNNnBYGEe9w/iY= +modernc.org/sqlite v1.40.1/go.mod h1:9fjQZ0mB1LLP0GYrp39oOJXx/I2sxEnZtzCmEQIKvGE= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= mvdan.cc/gofumpt v0.9.2 h1:zsEMWL8SVKGHNztrx6uZrXdp7AX8r421Vvp23sz7ik4= mvdan.cc/gofumpt v0.9.2/go.mod h1:iB7Hn+ai8lPvofHd9ZFGVg2GOr8sBUw1QUWjNbmIL/s= mvdan.cc/unparam v0.0.0-20251027182757-5beb8c8f8f15 h1:ssMzja7PDPJV8FStj7hq9IKiuiKhgz9ErWw+m68e7DI= diff --git a/backend/http/duplicates.go b/backend/http/duplicates.go index 9101b7363..6df5d02f7 100644 --- a/backend/http/duplicates.go +++ b/backend/http/duplicates.go @@ -8,30 +8,29 @@ 
import ( "net/url" "os" "path/filepath" - "sort" "strings" "sync" "time" "unicode" "github.com/gtsteffaniak/filebrowser/backend/common/settings" + "github.com/gtsteffaniak/filebrowser/backend/database/sql" "github.com/gtsteffaniak/filebrowser/backend/indexing" "github.com/gtsteffaniak/go-cache/cache" + "github.com/gtsteffaniak/go-logger/logger" ) // duplicateSearchMutex serializes duplicate searches to run one at a time // This is separate from the index's scanMutex to avoid conflicts with indexing var duplicateSearchMutex sync.Mutex +const maxGroups = 500 // Limit total duplicate groups + // duplicateResultsCache caches duplicate search results for 15 seconds var duplicateResultsCache = cache.NewCache[[]duplicateGroup](15 * time.Second) -// fileLocation is a minimal reference to a file in the index -// Used during duplicate detection to avoid allocating full SearchResult objects -type fileLocation struct { - dirPath string // path to directory in index - fileIdx int // index in directory's Files slice -} +// checksumCache caches file checksums for 1 hour, keyed by source/path/modtime +var checksumCache = cache.NewCache[string](1 * time.Hour) type duplicateGroup struct { Size int64 `json:"size"` @@ -110,15 +109,55 @@ func duplicatesHandler(w http.ResponseWriter, r *http.Request, d *requestContext return renderJSON(w, r, duplicateGroups) } -// findDuplicatesInIndex finds duplicates by working directly with index structures -// This minimizes memory allocation by only creating SearchResult objects for final results +// findDuplicatesInIndex finds duplicates using SQLite streaming approach +// This minimizes memory allocation by: +// 1. Streaming files into a temporary SQLite database (no in-memory map) +// 2. Querying SQLite for size groups with 2+ files +// 3. Processing each size group sequentially (only one in memory at a time) +// 4. 
Only creating SearchResult objects for final verified duplicates func findDuplicatesInIndex(index *indexing.Index, opts *duplicatesOptions) []duplicateGroup { - const maxTotalFiles = 1000 // Limit total files across all groups - // Step 1: Group files by size, working directly with index structures - // Key: size, Value: list of (dirPath, fileIndex) minimal references - sizeGroups := make(map[int64][]fileLocation) + // Create temporary SQLite database for streaming in cache directory + tempDB, err := sql.NewTempDB("duplicates", &sql.TempDBConfig{ + CacheSizeKB: -2000, // ~8MB, sufficient for small-medium datasets + MmapSize: 104857600, // 100MB - reasonable limit + Synchronous: "OFF", // Maximum write performance for temp DB + EnableLogging: false, + }) + if err != nil { + // Return empty results if SQLite fails + return []duplicateGroup{} + } + defer tempDB.Close() + + // Create the duplicates table with indexes + // Indexes are created before insert so they're immediately available for queries + tableStart := time.Now() + if err = tempDB.CreateDuplicatesTable(); err != nil { + return []duplicateGroup{} + } + if time.Since(tableStart) > 10*time.Millisecond { + logger.Debugf("[Duplicates] Table and index creation took %v", time.Since(tableStart)) + } + + // Step 1: Stream files into SQLite database + // This avoids building a huge map in memory + tx, err := tempDB.BeginTransaction() + if err != nil { + return []duplicateGroup{} + } + + // Prepare statement for bulk inserts + stmt, err := tx.Prepare("INSERT OR IGNORE INTO files (dir_path, file_idx, size, name, normalized_name, extension) VALUES (?, ?, ?, ?, ?, ?)") + if err != nil { + if err2 := tx.Rollback(); err2 != nil { + logger.Errorf("[Duplicates] Failed to rollback transaction: %v", err2) + } + return []duplicateGroup{} + } + defer stmt.Close() + insertErr := error(nil) index.ReadOnlyOperation(func() { for dirPath, dir := range index.GetDirectories() { // Skip directories not in scope @@ -129,117 +168,137 @@ func findDuplicatesInIndex(index *indexing.Index, opts *duplicatesOptions) []dup for i := range dir.Files { file := &dir.Files[i] if file.Size >= opts.minSize { - sizeGroups[file.Size] = append(sizeGroups[file.Size], fileLocation{dirPath, i}) + // Normalize filename for efficient matching + normalizedName := normalizeFilename(file.Name) + extension := strings.ToLower(filepath.Ext(file.Name)) + + // Insert into SQLite (will be committed in batch) + _, err = stmt.Exec(dirPath, i, file.Size, file.Name, normalizedName, extension) + if err != nil && insertErr == nil { + insertErr = err + } } } } }) - // Step 2: Process each size group to find duplicates using filename matching (fast) - // We'll verify with checksums later on the final groups - candidateGroups := []struct { - size int64 - locations []fileLocation - }{} - totalFiles := 0 - - for size, locations := range sizeGroups { - if len(locations) < 2 { - continue // Skip files with no potential duplicates + // Commit the transaction + if insertErr != nil || tx.Commit() != nil { + return []duplicateGroup{} + } + + // Step 2: Query SQLite for size groups with 2+ files (already sorted by SQL) + sizes, _, err := tempDB.GetSizeGroupsForDuplicates(opts.minSize) + if err != nil { + return []duplicateGroup{} + } + // Step 3: Process each size group sequentially to minimize memory + duplicateGroups := []duplicateGroup{} + totalFileQueryTime := time.Duration(0) + fileQueryCount := 0 + + for _, size := range sizes { + // Stop if we've hit the group limit + if len(duplicateGroups) >= 
maxGroups { + break + } + + // Get files for this size from SQLite + fileQueryStart := time.Now() + locations, err := tempDB.GetFilesBySizeForDuplicates(size) + fileQueryDuration := time.Since(fileQueryStart) + totalFileQueryTime += fileQueryDuration + fileQueryCount++ + + if err != nil || len(locations) < 2 { + continue } // Use filename matching for initial grouping (fast) - groups := groupLocationsByFilename(locations, index, size) + // Work directly with sql.FileLocation - no conversion needed + groups := groupLocationsByFilenameWithMetadata(locations, index, size) - // Collect candidate groups up to the limit + // Process candidate groups up to the limit for _, locGroup := range groups { if len(locGroup) < 2 { continue } - // Check if adding this group would exceed the limit - if totalFiles+len(locGroup) > maxTotalFiles { + // Stop if we've hit the group limit + if len(duplicateGroups) >= maxGroups { break } - candidateGroups = append(candidateGroups, struct { - size int64 - locations []fileLocation - }{size, locGroup}) - totalFiles += len(locGroup) - } - - // Stop processing more size groups if we've hit the limit - if totalFiles >= maxTotalFiles { - break - } - } + // Verify with checksums + verifiedGroups := groupLocationsByChecksumFromSQL(locGroup, index, size) - // Step 3: Verify all final groups with checksums - duplicateGroups := []duplicateGroup{} - for _, candidate := range candidateGroups { - // Verify with checksums - verifiedGroups := groupLocationsByChecksum(candidate.locations, index, candidate.size) + // Create SearchResult objects only for verified duplicates + for _, verifiedGroup := range verifiedGroups { + if len(verifiedGroup) < 2 { + continue + } - // Create SearchResult objects only for verified duplicates - for _, locGroup := range verifiedGroups { - if len(locGroup) < 2 { - continue - } + resultGroup := make([]*indexing.SearchResult, 0, len(verifiedGroup)) + index.ReadOnlyOperation(func() { + dirs := index.GetDirectories() + for _, loc := range verifiedGroup { + dir := dirs[loc.DirPath] + if dir == nil { + continue + } + if loc.FileIdx >= len(dir.Files) { + continue + } + file := &dir.Files[loc.FileIdx] - resultGroup := make([]*indexing.SearchResult, 0, len(locGroup)) - index.ReadOnlyOperation(func() { - dirs := index.GetDirectories() - for _, loc := range locGroup { - dir := dirs[loc.dirPath] - if dir == nil { - continue - } - if loc.fileIdx >= len(dir.Files) { - continue - } - file := &dir.Files[loc.fileIdx] + // CRITICAL: Verify size matches the expected size for this group + if file.Size != size { + continue + } - // CRITICAL: Verify size matches the expected size for this group - // Index could have changed between initial grouping and now - if file.Size != candidate.size { - continue - } + // Construct full path + fullPath := filepath.Join(loc.DirPath, file.Name) - // Construct full path - fullPath := filepath.Join(loc.dirPath, file.Name) + // Remove the user scope from path + adjustedPath := strings.TrimPrefix(fullPath, opts.combinedPath) + if adjustedPath == "" { + adjustedPath = "/" + } - // Remove the user scope from path - adjustedPath := strings.TrimPrefix(fullPath, opts.combinedPath) - if adjustedPath == "" { - adjustedPath = "/" + resultGroup = append(resultGroup, &indexing.SearchResult{ + Path: adjustedPath, + Type: file.Type, + Size: file.Size, + Modified: file.ModTime.Format(time.RFC3339), + HasPreview: file.HasPreview, + }) } + }) - resultGroup = append(resultGroup, &indexing.SearchResult{ - Path: adjustedPath, - Type: file.Type, - Size: 
file.Size, - Modified: file.ModTime.Format(time.RFC3339), - HasPreview: file.HasPreview, + if len(resultGroup) >= 2 { + duplicateGroups = append(duplicateGroups, duplicateGroup{ + Size: size, + Count: len(resultGroup), + Files: resultGroup, }) } - }) + } - if len(resultGroup) >= 2 { - duplicateGroups = append(duplicateGroups, duplicateGroup{ - Size: candidate.size, - Count: len(resultGroup), - Files: resultGroup, - }) + // Stop if we've hit the group limit + if len(duplicateGroups) >= maxGroups { + break } } } - // Sort groups by size (largest to smallest) - sort.Slice(duplicateGroups, func(i, j int) bool { - return duplicateGroups[i].Size > duplicateGroups[j].Size - }) + // Log aggregate query performance + if fileQueryCount > 0 { + avgFileQueryTime := totalFileQueryTime / time.Duration(fileQueryCount) + logger.Debugf("[Duplicates] File-by-size queries: %d queries, total %v, avg %v per query", + fileQueryCount, totalFileQueryTime, avgFileQueryTime) + } + // Groups are already sorted by size (largest to smallest) from SQL query return duplicateGroups } @@ -289,27 +348,29 @@ func prepDuplicatesOptions(r *http.Request, d *requestContext) (*duplicatesOptio }, nil } -// groupLocationsByChecksum groups file locations by partial checksum -// Works with minimal fileLocation references instead of full SearchResult objects -func groupLocationsByChecksum(locations []fileLocation, index *indexing.Index, fileSize int64) [][]fileLocation { - checksumMap := make(map[string][]fileLocation) +// groupLocationsByChecksumFromSQL groups SQLite file locations by partial checksum +// Works directly with sql.FileLocation instead of converting to fileLocation +func groupLocationsByChecksumFromSQL(locations []sql.FileLocation, index *indexing.Index, fileSize int64) [][]sql.FileLocation { + checksumMap := make(map[string][]sql.FileLocation) // Build checksum groups for _, loc := range locations { // Construct filesystem path for checksum computation - // index.Path is the absolute filesystem root, loc.dirPath is index-relative - fullPath := filepath.Join(index.Path, loc.dirPath) + // index.Path is the absolute filesystem root, loc.DirPath is index-relative + fullPath := filepath.Join(index.Path, loc.DirPath) - // Get the filename from the index and verify size still matches + // Get the filename and modtime from the index and verify size still matches var fileName string + var fileModTime time.Time var sizeMatches bool index.ReadOnlyOperation(func() { - if dir := index.GetDirectories()[loc.dirPath]; dir != nil { - if loc.fileIdx < len(dir.Files) { - file := &dir.Files[loc.fileIdx] + if dir := index.GetDirectories()[loc.DirPath]; dir != nil { + if loc.FileIdx < len(dir.Files) { + file := &dir.Files[loc.FileIdx] // CRITICAL: Verify size matches expected size for this group if file.Size == fileSize { fileName = file.Name + fileModTime = file.ModTime sizeMatches = true } } @@ -321,7 +382,7 @@ func groupLocationsByChecksum(locations []fileLocation, index *indexing.Index, f } filePath := filepath.Join(fullPath, fileName) - checksum, err := computePartialChecksum(filePath, fileSize) + checksum, err := computePartialChecksum(index.Path, filePath, fileSize, fileModTime) if err != nil { continue } @@ -329,7 +390,7 @@ func groupLocationsByChecksum(locations []fileLocation, index *indexing.Index, f } // Convert map to slice of groups - groups := make([][]fileLocation, 0, len(checksumMap)) + groups := make([][]sql.FileLocation, 0, len(checksumMap)) for _, group := range checksumMap { if len(group) >= 2 { groups = append(groups, 
group) @@ -345,8 +406,17 @@ func groupLocationsByChecksum(locations []fileLocation, index *indexing.Index, f // - Always read first 8KB (header/metadata) // - For files > 24KB: sample middle 8KB and last 8KB // - Total read: ~24KB max per file regardless of file size -func computePartialChecksum(path string, size int64) (string, error) { - file, err := os.Open(path) +// Checksums are cached for 1 hour based on source/path/modtime +func computePartialChecksum(sourcePath, filePath string, size int64, modTime time.Time) (string, error) { + // Generate cache key from source path, file path, and modification time + cacheKey := fmt.Sprintf("%s:%s:%d:%d", sourcePath, filePath, size, modTime.Unix()) + + // Check cache first + if cachedChecksum, ok := checksumCache.Get(cacheKey); ok { + return cachedChecksum, nil + } + + file, err := os.Open(filePath) if err != nil { return "", err } @@ -387,17 +457,22 @@ func computePartialChecksum(path string, size int64) (string, error) { } } - return fmt.Sprintf("%x", hash.Sum(nil)), nil + checksum := fmt.Sprintf("%x", hash.Sum(nil)) + + // Cache the result + checksumCache.Set(cacheKey, checksum) + + return checksum, nil } -// groupLocationsByFilename groups file locations by fuzzy filename matching -// Works with minimal fileLocation references instead of full SearchResult objects -func groupLocationsByFilename(locations []fileLocation, index *indexing.Index, expectedSize int64) [][]fileLocation { +// groupLocationsByFilenameWithMetadata uses pre-computed normalized names from SQLite +// Works directly with sql.FileLocation - no conversion needed +func groupLocationsByFilenameWithMetadata(locations []sql.FileLocation, index *indexing.Index, expectedSize int64) [][]sql.FileLocation { if len(locations) == 0 { return nil } - // First, fetch all file metadata we need for comparison in a single lock + // Verify sizes from index (still need to check index is up to date) type fileMetadata struct { name string size int64 @@ -407,11 +482,9 @@ func groupLocationsByFilename(locations []fileLocation, index *indexing.Index, e index.ReadOnlyOperation(func() { dirs := index.GetDirectories() for i, loc := range locations { - if dir := dirs[loc.dirPath]; dir != nil { - if loc.fileIdx < len(dir.Files) { - file := &dir.Files[loc.fileIdx] - // CRITICAL: Only include files that match the expected size - // Index could have changed since locations were collected + if dir := dirs[loc.DirPath]; dir != nil { + if loc.FileIdx < len(dir.Files) { + file := &dir.Files[loc.FileIdx] if file.Size == expectedSize { metadata[i] = fileMetadata{ name: file.Name, @@ -423,8 +496,7 @@ func groupLocationsByFilename(locations []fileLocation, index *indexing.Index, e } }) - // Now group by fuzzy filename matching without holding the lock - groups := [][]fileLocation{} + groups := [][]sql.FileLocation{} used := make(map[int]bool) for i := 0; i < len(locations); i++ { @@ -432,33 +504,29 @@ func groupLocationsByFilename(locations []fileLocation, index *indexing.Index, e continue } - group := []fileLocation{locations[i]} + group := []sql.FileLocation{locations[i]} used[i] = true baseSize := metadata[i].size - baseName1 := metadata[i].name - ext1 := strings.ToLower(filepath.Ext(baseName1)) - filename1 := normalizeFilename(baseName1) + // Use pre-computed normalized name and extension from SQLite + filename1 := locations[i].NormalizedName + ext1 := locations[i].Extension for j := i + 1; j < len(locations); j++ { if used[j] || metadata[j].name == "" { continue } - // CRITICAL: Ensure exact size match if 
metadata[j].size != baseSize { continue } - baseName2 := metadata[j].name - ext2 := strings.ToLower(filepath.Ext(baseName2)) - - // CRITICAL: Extensions must match exactly (case-insensitive) + ext2 := locations[j].Extension if ext1 != ext2 { continue } - filename2 := normalizeFilename(baseName2) + filename2 := locations[j].NormalizedName // Check if filenames are similar enough if filenamesSimilar(filename1, filename2) { diff --git a/backend/http/jobs.go b/backend/http/jobs.go index adba8a7f2..7dc06f8b4 100644 --- a/backend/http/jobs.go +++ b/backend/http/jobs.go @@ -23,7 +23,7 @@ func getJobsHandler(w http.ResponseWriter, r *http.Request, d *requestContext) ( sources := settings.GetSources(d.user) reducedIndexes := map[string]indexing.ReducedIndex{} for _, source := range sources { - reducedIndex, err := indexing.GetIndexInfo(source) + reducedIndex, err := indexing.GetIndexInfo(source, false) if err != nil { logger.Debugf("error getting index info: %v", err) continue diff --git a/backend/http/raw.go b/backend/http/raw.go index 148d1cfe3..6c488eee1 100644 --- a/backend/http/raw.go +++ b/backend/http/raw.go @@ -58,21 +58,45 @@ func (r *throttledReadSeeker) Seek(offset int64, whence int) (int64, error) { return r.rs.Seek(offset, whence) } +// toASCIIFilename converts a filename to ASCII-safe format by replacing non-ASCII characters with underscores +func toASCIIFilename(fileName string) string { + var result strings.Builder + for _, r := range fileName { + if r > 127 { + // Replace non-ASCII characters with underscore + result.WriteRune('_') + } else { + result.WriteRune(r) + } + } + return result.String() +} + func setContentDisposition(w http.ResponseWriter, r *http.Request, fileName string) { + dispositionType := "attachment" if r.URL.Query().Get("inline") == "true" { - w.Header().Set("Content-Disposition", "inline; filename*=utf-8''"+url.PathEscape(fileName)) - } else { - // As per RFC6266 section 4.3 - w.Header().Set("Content-Disposition", "attachment; filename*=utf-8''"+url.PathEscape(fileName)) + dispositionType = "inline" } + + // standard: ASCII-only safe fallback + asciiFileName := toASCIIFilename(fileName) + // RFC 5987: UTF-8 encoded + encodedFileName := url.PathEscape(fileName) + + // Always set both filename (ASCII) and filename* (UTF-8) for maximum compatibility (RFC 6266) + w.Header().Set("Content-Disposition", fmt.Sprintf("%s; filename=%q; filename*=utf-8''%s", dispositionType, asciiFileName, encodedFileName)) } // rawHandler serves the raw content of a file, multiple files, or directory in various formats. // @Summary Get raw content of a file, multiple files, or directory // @Description Returns the raw content of a file, multiple files, or a directory. Supports downloading files as archives in various formats. +// @Description +// @Description **Filename Encoding:** +// @Description - The Content-Disposition header will always include both: +// @Description 1. `filename="..."`: An ASCII-safe version of the filename for compatibility. +// @Description 2. `filename*=utf-8”...`: The full UTF-8 encoded filename (RFC 6266/5987) for modern clients. // @Tags Resources // @Accept json -// @Produce json // @Param files query string true "a list of files in the following format 'source::filename' and separated by '||' with additional items in the list. (required)" // @Param inline query bool false "If true, sets 'Content-Disposition' to 'inline'. Otherwise, defaults to 'attachment'." 
// @Param algo query string false "Compression algorithm for archiving multiple files or directories. Options: 'zip' and 'tar.gz'. Default is 'zip'." @@ -395,6 +419,7 @@ func rawFilesHandler(w http.ResponseWriter, r *http.Request, d *requestContext, } // serve content allows for range requests. // video scrubbing, etc. + // Note: http.ServeContent will respect our already-set Content-Disposition header var reader io.ReadSeeker = fd if d.share != nil && d.share.MaxBandwidth > 0 { // convert KB/s to B/s @@ -432,7 +457,8 @@ func rawFilesHandler(w http.ResponseWriter, r *http.Request, d *requestContext, if len(fileList) == 1 && isDir { baseDirName = filepath.Base(realPath) } - fileName = url.PathEscape(baseDirName + extension) + // Store original filename before any encoding + originalFileName := baseDirName + extension archiveData := filepath.Join(config.Server.CacheDir, utils.InsecureRandomIdentifier(10)) if extension == ".zip" { @@ -462,11 +488,12 @@ func rawFilesHandler(w http.ResponseWriter, r *http.Request, d *requestContext, sizeInMB := fileInfo.Size() / 1024 / 1024 if sizeInMB > 500 { - logger.Debugf("User %v is downloading large (%d MB) file: %v", d.user.Username, sizeInMB, fileName) + logger.Debugf("User %v is downloading large (%d MB) file: %v", d.user.Username, sizeInMB, originalFileName) } // Set headers AFTER computing actual archive size - w.Header().Set("Content-Disposition", "attachment; filename*=utf-8''"+fileName) + // Use the same setContentDisposition logic for archives + setContentDisposition(w, r, originalFileName) w.Header().Set("Content-Length", fmt.Sprintf("%d", fileInfo.Size())) w.Header().Set("Content-Type", "application/octet-stream") diff --git a/backend/indexing/indexingFiles.go b/backend/indexing/indexingFiles.go index 542af9862..c744c5e0f 100644 --- a/backend/indexing/indexingFiles.go +++ b/backend/indexing/indexingFiles.go @@ -69,6 +69,7 @@ type Index struct { FoundHardLinks map[string]uint64 `json:"-"` // hardlink path -> size processedInodes map[uint64]struct{} `json:"-"` totalSize uint64 `json:"-"` + previousTotalSize uint64 `json:"-"` // Track previous totalSize for change detection // Scanner management (new multi-scanner system) scanners map[string]*Scanner `json:"-"` // path -> scanner @@ -234,7 +235,8 @@ func (idx *Index) GetFsDirInfo(adjustedPath string) (*iteminfo.FileInfo, error) if !dirInfo.IsDir() { // Use handleFile for consistent size calculation across platforms - realSize, _ := idx.handleFile(dirInfo, adjustedPath, realPath) + // API calls (GetFsDirInfo) should not update totalSize, so pass false for isRoutineScan + realSize, _ := idx.handleFile(dirInfo, adjustedPath, realPath, false) size := int64(realSize) fileInfo := iteminfo.FileInfo{ @@ -246,16 +248,10 @@ func (idx *Index) GetFsDirInfo(adjustedPath string) (*iteminfo.FileInfo, error) }, } fileInfo.DetectType(realPath, false) - - // Set HasPreview flags using consolidated helper setFilePreviewFlags(&fileInfo.ItemInfo, realPath) - return &fileInfo, nil } - - // Normalize directory path to always have trailing slash adjustedPath = utils.AddTrailingSlashIfNotExists(adjustedPath) - // adjustedPath is already normalized with trailing slash combinedPath := adjustedPath var response *iteminfo.FileInfo response, err = idx.GetDirInfo(dir, dirInfo, realPath, adjustedPath, combinedPath, actionConfig{ @@ -272,7 +268,6 @@ func (idx *Index) GetFsDirInfo(adjustedPath string) (*iteminfo.FileInfo, error) found := false for _, item := range response.Files { if item.Name == baseName { - // Clean path to 
remove trailing slashes before joining filePath := strings.TrimSuffix(adjustedPath, "/") + "/" + item.Name response = &iteminfo.FileInfo{ Path: filePath, @@ -292,9 +287,7 @@ func (idx *Index) GetFsDirInfo(adjustedPath string) (*iteminfo.FileInfo, error) } func (idx *Index) GetDirInfo(dirInfo *os.File, stat os.FileInfo, realPath, adjustedPath, combinedPath string, config actionConfig) (*iteminfo.FileInfo, error) { - // Ensure combinedPath has exactly one trailing slash to prevent double slashes in subdirectory paths - combinedPath = strings.TrimRight(combinedPath, "/") + "/" - // Read directory contents + combinedPath = utils.AddTrailingSlashIfNotExists(combinedPath) files, err := dirInfo.Readdir(-1) if err != nil { return nil, err @@ -309,8 +302,6 @@ func (idx *Index) GetDirInfo(dirInfo *os.File, stat os.FileInfo, realPath, adjus hasPreview = realDirInfo.HasPreview } } - - // Process each file and directory in the current directory for _, file := range files { hidden := isHidden(file, idx.Path+combinedPath) isDir := iteminfo.IsDirectory(file) @@ -321,14 +312,11 @@ func (idx *Index) GetDirInfo(dirInfo *os.File, stat os.FileInfo, realPath, adjus continue } } - // Skip logic based on mode if config.CheckViewable { - // When checking viewable items: skip if shouldSkip=true AND not viewable if idx.shouldSkip(isDir, hidden, fullCombined, baseName, config) && !idx.IsViewable(isDir, fullCombined) { continue } } else { - // Normal indexing mode: skip if shouldSkip=true if idx.shouldSkip(isDir, hidden, fullCombined, baseName, config) { continue } @@ -341,7 +329,6 @@ func (idx *Index) GetDirInfo(dirInfo *os.File, stat os.FileInfo, realPath, adjus if isDir { dirPath := combinedPath + file.Name() - // Check NeverWatchPaths map - O(1) lookup for paths with neverWatch: true if idx.wasIndexed && config.Recursive && idx.Config.ResolvedConditionals != nil { if _, exists := idx.Config.ResolvedConditionals.NeverWatchPaths[fullCombined]; exists { realDirInfo, exists := idx.GetMetadataInfo(dirPath, true) @@ -355,11 +342,9 @@ func (idx *Index) GetDirInfo(dirInfo *os.File, stat os.FileInfo, realPath, adjus if omitList[file.Name()] { continue } - if config.Recursive { // clear for garbage collection file = nil - // Recursively index the subdirectory err = idx.indexDirectory(dirPath, config) if err != nil { logger.Errorf("Failed to index directory %s: %v", dirPath, err) @@ -376,17 +361,14 @@ func (idx *Index) GetDirInfo(dirInfo *os.File, stat os.FileInfo, realPath, adjus dirInfos = append(dirInfos, *itemInfo) if config.IsRoutineScan { idx.NumDirs++ - // Also update the active scanner's counter idx.incrementScannerDirs() } } else { realFilePath := realPath + "/" + file.Name() - size, shouldCountSize := idx.handleFile(file, fullCombined, realFilePath) + size, shouldCountSize := idx.handleFile(file, fullCombined, realFilePath, config.IsRoutineScan) itemInfo.DetectType(realFilePath, false) - // Set HasPreview flags - use cached metadata optimization only when indexing is enabled usedCachedPreview := false if !idx.Config.DisableIndexing && config.Recursive { - // Optimization: For audio files during indexing, check if we can use cached album art info simpleType := strings.Split(itemInfo.Type, "/")[0] if simpleType == "audio" { previousInfo, exists := idx.GetReducedMetadata(fullCombined, false) @@ -397,19 +379,13 @@ func (idx *Index) GetDirInfo(dirInfo *os.File, stat os.FileInfo, realPath, adjus } } } - // When indexing is disabled or CheckViewable mode, always check directly - // Skip if we already used cached preview 
data (avoids redundant HasAlbumArt checks) if !usedCachedPreview { setFilePreviewFlags(itemInfo, realPath+"/"+file.Name()) } itemInfo.Size = int64(size) - // Update parent folder preview status for images, videos, and audio with album art - // Use shared function to determine if this file type should bubble up to folder preview if itemInfo.HasPreview && iteminfo.ShouldBubbleUpToFolderPreview(*itemInfo) { hasPreview = true } - - // Wrap ItemInfo in ExtendedItemInfo for files array extItemInfo := iteminfo.ExtendedItemInfo{ ItemInfo: *itemInfo, } @@ -419,23 +395,13 @@ func (idx *Index) GetDirInfo(dirInfo *os.File, stat os.FileInfo, realPath, adjus } if config.IsRoutineScan { idx.NumFiles++ - // Also update the active scanner's counter idx.incrementScannerFiles() } } } - if totalSize == 0 && idx.Config.Conditionals.ZeroSizeFolders { return nil, errors.ErrNotIndexed } - - if adjustedPath == "/" { - idx.mu.Lock() - idx.DiskUsed = uint64(totalSize) - idx.mu.Unlock() - } - - // Create FileInfo for the current directory (adjustedPath is already normalized with trailing slash) dirFileInfo := &iteminfo.FileInfo{ Path: adjustedPath, Files: fileInfos, @@ -449,29 +415,20 @@ func (idx *Index) GetDirInfo(dirInfo *os.File, stat os.FileInfo, realPath, adjus HasPreview: hasPreview, } dirFileInfo.SortItems() - - // Metadata will be updated by the caller (indexDirectory or GetFsDirInfo) return dirFileInfo, nil } // RecursiveUpdateDirSizes updates parent directory sizes recursively up the tree -// childInfo should have the NEW size, previousSize should be the OLD size func (idx *Index) RecursiveUpdateDirSizes(childInfo *iteminfo.FileInfo, previousSize int64) { parentDir := utils.GetParentDirectoryPath(childInfo.Path) - parentInfo, exists := idx.GetMetadataInfo(parentDir, true) if !exists || parentDir == "" { return } - - // Calculate size delta and update parent previousParentSize := parentInfo.Size sizeDelta := childInfo.Size - previousSize parentInfo.Size = previousParentSize + sizeDelta - idx.UpdateMetadata(parentInfo) - - // Recursively update grandparents idx.RecursiveUpdateDirSizes(parentInfo, previousParentSize) } @@ -483,12 +440,10 @@ func (idx *Index) GetRealPath(relativePath ...string) (string, bool, error) { if ok && cached != "" { return cached, isDir, nil } - // Convert relative path to absolute path absolutePath, err := filepath.Abs(joinedPath) if err != nil { return absolutePath, false, fmt.Errorf("could not get real path: %v, %s", joinedPath, err) } - // Resolve symlinks and get the real path realPath, isDir, err := iteminfo.ResolveSymlinks(absolutePath) if err == nil { RealPathCache.Set(joinedPath, realPath) @@ -502,32 +457,23 @@ func (idx *Index) RefreshFileInfo(opts utils.FileOptions) error { Quick: false, Recursive: opts.Recursive, } - targetPath := opts.Path if !opts.IsDir { targetPath = idx.MakeIndexPath(filepath.Dir(targetPath)) } - - // Get PREVIOUS metadata BEFORE indexing previousInfo, previousExists := idx.GetMetadataInfo(targetPath, true) var previousSize int64 if previousExists { previousSize = previousInfo.Size } - - // Re-index the directory err := idx.indexDirectoryWithOptions(targetPath, config) if err != nil { return err } - - // Get the NEW metadata after indexing newInfo, exists := idx.GetMetadataInfo(targetPath, true) if !exists { return fmt.Errorf("file/folder does not exist in metadata: %s", targetPath) } - - // If size changed, propagate to parents if previousSize != newInfo.Size { idx.RecursiveUpdateDirSizes(newInfo, previousSize) } @@ -536,24 +482,19 @@ func (idx 
*Index) RefreshFileInfo(opts utils.FileOptions) error { } func isHidden(file os.FileInfo, srcPath string) bool { - // Check if the file starts with a dot (common on Unix systems) if file.Name()[0] == '.' { return true } - if runtime.GOOS == "windows" { return CheckWindowsHidden(filepath.Join(srcPath, file.Name())) } - // Default behavior for non-Windows systems return false } // setFilePreviewFlags determines if a file should have a preview based on its type -// This consolidates the logic used in both GetFsDirInfo and GetDirInfo func setFilePreviewFlags(fileInfo *iteminfo.ItemInfo, realPath string) { simpleType := strings.Split(fileInfo.Type, "/")[0] - // Check for HEIC/HEIF switch fileInfo.Type { case "image/heic", "image/heif": fileInfo.HasPreview = settings.CanConvertImage("heic") @@ -591,11 +532,9 @@ func (idx *Index) IsViewable(isDir bool, adjustedPath string) bool { baseName := filepath.Base(strings.TrimSuffix(adjustedPath, "/")) if isDir { - // Exact match (O(1)) if rule, exists := rules.FolderNames[baseName]; exists && rule.Viewable { return true } - // Prefix/suffix (O(n)) for _, rule := range rules.FolderPaths { if strings.HasPrefix(adjustedPath, rule.FolderPath) && rule.Viewable { return true @@ -612,11 +551,9 @@ func (idx *Index) IsViewable(isDir bool, adjustedPath string) bool { } } } else { - // Exact match (O(1)) if rule, exists := rules.FileNames[baseName]; exists && rule.Viewable { return true } - // Prefix/suffix (O(n)) for _, rule := range rules.FilePaths { if strings.HasPrefix(adjustedPath, rule.FilePath) && rule.Viewable { return true @@ -644,22 +581,13 @@ func (idx *Index) shouldSkip(isDir bool, isHidden bool, fullCombined, baseName s if fullCombined == "/" { return false } - // When indexing is disabled globally, behavior depends on the mode if idx.Config.DisableIndexing { - // If checking viewable (filesystem access), don't skip - show everything from filesystem - if config.CheckViewable { - return false - } - // If indexing mode, skip everything - return true + return !config.CheckViewable } if isDir && config.IsRoutineScan { - // Check NeverWatch: Skip directories with index=true AND neverWatch=true during routine scans - // This allows them to be indexed once but never re-scanned _, ok := rules.NeverWatchPaths[fullCombined] if ok { - // skip scanning, but is viewable return true } } @@ -667,18 +595,13 @@ func (idx *Index) shouldSkip(isDir bool, isHidden bool, fullCombined, baseName s if isDir { if rule, ok := rules.FolderNames[baseName]; ok { if _, ok := rules.FolderPaths[fullCombined]; !ok { - // create a rule so sub folders can be skipped rules.FolderPaths[fullCombined] = rule } return true } - - // Check FolderPaths: Two-stage check for performance - // Stage 1: O(1) exact match (fast path for the folder itself) if _, ok := rules.FolderPaths[fullCombined]; ok { return true } - // Stage 2: O(n) prefix match (for child folders) for path := range rules.FolderPaths { if strings.HasPrefix(fullCombined, path) { return true @@ -699,17 +622,12 @@ func (idx *Index) shouldSkip(isDir bool, isHidden bool, fullCombined, baseName s } } } else { - // Check FileNames (exact match on base name) - O(1) lookup if _, ok := rules.FileNames[baseName]; ok { return true } - - // Check FilePaths: Two-stage check for performance - // Stage 1: O(1) exact match (fast path for the file itself) if _, ok := rules.FilePaths[fullCombined]; ok { return true } - // Stage 2: O(n) prefix match (for files in excluded folders) for path := range rules.FilePaths { if strings.HasPrefix(fullCombined, 
path) { return true diff --git a/backend/indexing/indexingScanner.go b/backend/indexing/indexingScanner.go index 79dc3aa8b..d396bace7 100644 --- a/backend/indexing/indexingScanner.go +++ b/backend/indexing/indexingScanner.go @@ -8,25 +8,23 @@ import ( "github.com/gtsteffaniak/go-logger/logger" ) -// Scanner represents an independent scanner for a specific directory path -// Each scanner has its own schedule and stats, but only one can run at a time (protected by Index.scanMutex) +// Each scanner has its own schedule and stats, but only one can run at a time type Scanner struct { // Identity scanPath string // "/" for root scanner, "/Documents/" for child scanners - // Per-scanner scheduling (not shared between scanners) currentSchedule int smartModifier time.Duration complexity uint // 0-10 scale: 0=unknown, 1=simple, 2-6=normal, 7-9=complex, 10=highlyComplex fullScanCounter int // every 5th scan is a full scan - // Per-scanner stats (not shared) filesChanged bool lastScanned time.Time quickScanTime int fullScanTime int numDirs uint64 // Local count for this path numFiles uint64 // Local count for this path + scannerSize uint64 // Size contributed by this scanner (for delta calculation) // Reference back to parent index idx *Index @@ -35,63 +33,6 @@ type Scanner struct { stopChan chan struct{} } -// calculateTimeScore returns a 1-10 score based on full scan time -func (s *Scanner) calculateTimeScore() uint { - if s.fullScanTime == 0 { - return 1 // No data yet, assume simple - } - // Time-based thresholds (in seconds) - switch { - case s.fullScanTime < 2: - return 1 - case s.fullScanTime < 5: - return 2 - case s.fullScanTime < 10: - return 3 - case s.fullScanTime < 15: - return 4 - case s.fullScanTime < 30: - return 5 - case s.fullScanTime < 60: - return 6 - case s.fullScanTime < 90: - return 7 - case s.fullScanTime < 120: - return 8 - case s.fullScanTime < 180: - return 9 - default: - return 10 - } -} - -// calculateDirScore returns a 1-10 score based on directory count -func (s *Scanner) calculateDirScore() uint { - // Directory-based thresholds - switch { - case s.numDirs < 2500: - return 1 - case s.numDirs < 5000: - return 2 - case s.numDirs < 10000: - return 3 - case s.numDirs < 25000: - return 4 - case s.numDirs < 50000: - return 5 - case s.numDirs < 100000: - return 6 - case s.numDirs < 250000: - return 7 - case s.numDirs < 500000: - return 8 - case s.numDirs < 1000000: - return 9 - default: - return 10 - } -} - // start begins the scanner's main loop func (s *Scanner) start() { // Do initial scan for all scanners @@ -118,7 +59,6 @@ func (s *Scanner) start() { return case <-time.After(sleepTime): - // Time to scan! 
But must acquire mutex first s.tryAcquireAndScan() } } @@ -126,19 +66,6 @@ func (s *Scanner) start() { // tryAcquireAndScan attempts to acquire the global scan mutex and run a scan func (s *Scanner) tryAcquireAndScan() { - // Child scanners must wait for root scanner to go first each round - if s.scanPath != "/" { - s.idx.mu.RLock() - lastRootScan := s.idx.lastRootScanTime - myLastScan := s.lastScanned - s.idx.mu.RUnlock() - - // If we've scanned more recently than the root, skip this cycle - if !myLastScan.IsZero() && myLastScan.After(lastRootScan) { - return - } - } - s.idx.scanMutex.Lock() // Mark which scanner is active (for status/logging) @@ -146,14 +73,12 @@ func (s *Scanner) tryAcquireAndScan() { s.idx.activeScannerPath = s.scanPath s.idx.mu.Unlock() - // Determine if quick or full scan - // First scan (fullScanCounter=0) is always full - // Scans 1-4 are quick, scan 5 is full, then repeat quick := s.fullScanCounter > 0 && s.fullScanCounter < 5 s.fullScanCounter++ if s.fullScanCounter >= 5 { s.fullScanCounter = 0 } + s.runIndexing(quick) // Update this scanner's schedule based on results @@ -173,17 +98,15 @@ func (s *Scanner) tryAcquireAndScan() { s.idx.scanMutex.Unlock() - // Aggregate stats to Index level and update status (after releasing active scanner) + // Aggregate stats to Index level and update status s.idx.aggregateStatsFromScanners() } // runIndexing performs the actual indexing work func (s *Scanner) runIndexing(quick bool) { if s.scanPath == "/" { - // ROOT SCANNER: Non-recursive, just scan root directory itself s.runRootScan(quick) } else { - // CHILD SCANNER: Recursive scan of assigned directory s.runChildScan(quick) } @@ -204,6 +127,25 @@ func (s *Scanner) runRootScan(quick bool) { s.numFiles = 0 } + // Track size before scan for delta calculation + previousScannerSize := s.scannerSize + + s.idx.mu.Lock() + if previousScannerSize > 0 { + if s.idx.totalSize >= previousScannerSize { + s.idx.totalSize -= previousScannerSize + } else { + // Safety check: if totalSize is less than previousScannerSize, something is wrong + // Reset to 0 to avoid underflow + logger.Warningf("[%s] Scanner [%s] WARNING: totalSize (%d) < previousScannerSize (%d), resetting totalSize to 0", s.idx.Name, s.scanPath, s.idx.totalSize, previousScannerSize) + s.idx.totalSize = 0 + } + } + indexSizeBefore := s.idx.totalSize + s.idx.mu.Unlock() + + logger.Debugf("[%s] Scanner [%s] START: indexSizeBefore=%d, previousScannerSize=%d (quick=%v)", s.idx.Name, s.scanPath, indexSizeBefore, previousScannerSize, quick) + s.filesChanged = false startTime := time.Now() @@ -212,6 +154,12 @@ func (s *Scanner) runRootScan(quick bool) { logger.Errorf("Root scanner error: %v", err) } + // Calculate delta for this scanner + s.idx.mu.RLock() + indexSizeAfter := s.idx.totalSize + s.idx.mu.RUnlock() + newScannerSize := indexSizeAfter - indexSizeBefore + s.scannerSize = newScannerSize scanDuration := int(time.Since(startTime).Seconds()) if quick { s.quickScanTime = scanDuration @@ -219,8 +167,6 @@ func (s *Scanner) runRootScan(quick bool) { s.fullScanTime = scanDuration s.updateComplexity() } - - // Check for new top-level directories and create scanners for them s.checkForNewChildDirectories() } @@ -237,15 +183,21 @@ func (s *Scanner) runChildScan(quick bool) { s.numDirs = 0 s.numFiles = 0 } - + s.idx.mu.RLock() + indexSizeBefore := s.idx.totalSize + s.idx.mu.RUnlock() s.filesChanged = false startTime := time.Now() - err := s.idx.indexDirectory(s.scanPath, config) if err != nil { logger.Errorf("Scanner [%s] error: %v", 
s.scanPath, err) } + s.idx.mu.RLock() + indexSizeAfter := s.idx.totalSize + s.idx.mu.RUnlock() + newScannerSize := indexSizeAfter - indexSizeBefore + s.scannerSize = newScannerSize scanDuration := int(time.Since(startTime).Seconds()) if quick { s.quickScanTime = scanDuration @@ -323,35 +275,23 @@ func (s *Scanner) getTopLevelDirs() []string { for _, file := range files { baseName := file.Name() - - // Skip files - only create scanners for directories if !file.IsDir() { - // Note: includeRootItems may contain files, but we only create scanners for directories continue } - dirPath := "/" + baseName + "/" - - // Skip directories in omit list if omitList[baseName] { logger.Debugf("Skipping scanner creation for omitted directory: %s", dirPath) continue } - - // Check if we should include this directory (respects includeRootItems filter) - // When includeRootItems is set, ONLY those items (that are directories) get scanners if !s.idx.shouldInclude(baseName) { logger.Debugf("Skipping scanner creation for non-included directory: %s", dirPath) continue } - - // Check if this directory should be excluded from indexing (respects exclusion rules) hidden := isHidden(file, s.idx.Path+dirPath) if s.idx.shouldSkip(true, hidden, dirPath, baseName, actionConfig{}) { logger.Debugf("Skipping scanner creation for excluded directory: %s", dirPath) continue } - dirs = append(dirs, dirPath) } @@ -360,10 +300,7 @@ func (s *Scanner) getTopLevelDirs() []string { // calculateSleepTime determines how long to wait before the next scan func (s *Scanner) calculateSleepTime() time.Duration { - // Get base schedule time and apply complexity modifier sleepTime := scanSchedule[s.currentSchedule] + s.smartModifier - - // Allow manual override via config if s.idx.Config.IndexingInterval > 0 { sleepTime = time.Duration(s.idx.Config.IndexingInterval) * time.Minute } @@ -404,19 +341,9 @@ func (s *Scanner) updateSchedule() { } // updateComplexity calculates the complexity level (1-10) for this scanner's directory -// 0: unknown, 1: simple, 2-6: normal, 7-9: complex, 10: highlyComplex +// 0: unknown func (s *Scanner) updateComplexity() { - // Calculate complexity based on both scan time and directory count - timeScore := s.calculateTimeScore() - dirScore := s.calculateDirScore() - - // Use the higher score (more conservative approach) - complexity := timeScore - if dirScore > timeScore { - complexity = dirScore - } - - s.complexity = complexity + s.complexity = calculateComplexity(s.fullScanTime, s.numDirs) // Set smartModifier based on complexity level if modifier, ok := complexityModifier[s.complexity]; ok { diff --git a/backend/indexing/indexingSchedule.go b/backend/indexing/indexingSchedule.go index c83a30268..06d7e412e 100644 --- a/backend/indexing/indexingSchedule.go +++ b/backend/indexing/indexingSchedule.go @@ -2,6 +2,7 @@ package indexing import ( "encoding/json" + "strconv" "time" "github.com/gtsteffaniak/filebrowser/backend/events" @@ -23,8 +24,6 @@ var scanSchedule = map[int]time.Duration{ } // complexityModifier defines time adjustments based on complexity level (0-10) -// 0: unknown (not yet scanned), 1: simple, 2-6: normal, 7-9: complex, 10: highlyComplex -// Each level gets progressively more aggressive with scan timing adjustments var complexityModifier = map[uint]time.Duration{ 0: 0 * time.Minute, // unknown: no modifier 1: -4 * time.Minute, // simple: scan more frequently @@ -39,6 +38,72 @@ var complexityModifier = map[uint]time.Duration{ 10: 16 * time.Minute, // highlyComplex: scan less frequently } +// 
calculateTimeScore returns a 1-10 score based on full scan time +func calculateTimeScore(fullScanTime int) uint { + if fullScanTime == 0 { + return 1 // No data yet, assume simple + } + switch { + case fullScanTime < 2: + return 1 + case fullScanTime < 5: + return 2 + case fullScanTime < 10: + return 3 + case fullScanTime < 15: + return 4 + case fullScanTime < 30: + return 5 + case fullScanTime < 60: + return 6 + case fullScanTime < 90: + return 7 + case fullScanTime < 120: + return 8 + case fullScanTime < 180: + return 9 + default: + return 10 + } +} + +// calculateDirScore returns a 1-10 score based on directory count +func calculateDirScore(numDirs uint64) uint { + // Directory-based thresholds + switch { + case numDirs < 2500: + return 1 + case numDirs < 5000: + return 2 + case numDirs < 10000: + return 3 + case numDirs < 25000: + return 4 + case numDirs < 50000: + return 5 + case numDirs < 100000: + return 6 + case numDirs < 250000: + return 7 + case numDirs < 500000: + return 8 + case numDirs < 1000000: + return 9 + default: + return 10 + } +} + +func calculateComplexity(fullScanTime int, numDirs uint64) uint { + timeScore := calculateTimeScore(fullScanTime) + dirScore := calculateDirScore(numDirs) + complexity := timeScore + if dirScore > timeScore { + complexity = dirScore + } + return complexity +} + var fullScanAnchor = 3 // index of the schedule for a full scan // Removed: Old single-scanner implementation - replaced by multi-scanner system in indexingScanner.go @@ -108,15 +173,14 @@ func (idx *Index) garbageCollection() { idx.DirectoriesLedger = make(map[string]struct{}) } -// Removed: UpdateSchedule - now handled per-scanner in Scanner.updateSchedule() - func (idx *Index) SendSourceUpdateEvent() error { if idx.mock { logger.Debug("Skipping source update event for mock index.") return nil } - reducedIndex, err := GetIndexInfo(idx.Name) + reducedIndex, err := GetIndexInfo(idx.Name, false) if err != nil { + logger.Errorf("[%s] Error getting index info: %v", idx.Name, err) return err } sourceAsMap := map[string]ReducedIndex{ @@ -124,14 +188,16 @@ func (idx *Index) SendSourceUpdateEvent() error { } message, err := json.Marshal(sourceAsMap) if err != nil { + logger.Errorf("[%s] Error marshaling source update: %v", idx.Name, err) return err } - events.SendSourceUpdate(idx.Name, string(message)) + // Quote the JSON string so it's sent as a string in the SSE message, not as an object + // The sendEvent function expects the message to be properly quoted (like "\"connection established\"") + quotedMessage := strconv.Quote(string(message)) + events.SendSourceUpdate(idx.Name, quotedMessage) return nil } -// Removed: RunIndexing - replaced by multi-scanner system where each scanner handles its own indexing - // setupMultiScanner creates and starts the multi-scanner system // Creates a root scanner (non-recursive) and child scanners for each top-level directory func (idx *Index) setupMultiScanner() { @@ -200,19 +266,25 @@ func (idx *Index) setupIndexingScanners() { // aggregateStatsFromScanners aggregates stats from all scanners to Index-level stats func (idx *Index) aggregateStatsFromScanners() { idx.mu.Lock() - defer idx.mu.Unlock() if len(idx.scanners) == 0 { + idx.mu.Unlock() return } + // Store previous stats and status to detect changes + // Use stored previous totalSize for change detection + prevNumDirs := idx.NumDirs + prevNumFiles := idx.NumFiles + prevDiskUsed := idx.previousTotalSize // Use stored previous value (0 on first call, which will trigger initial event) + prevStatus := 
idx.Status + // Aggregate stats from all scanners var totalDirs uint64 = 0 var totalFiles uint64 = 0 var totalQuickScanTime = 0 var totalFullScanTime = 0 var mostRecentScan time.Time - var maxComplexity uint = 0 // Start at 0 (unknown) allScannedAtLeastOnce := true for _, scanner := range idx.scanners { @@ -220,66 +292,53 @@ func (idx *Index) aggregateStatsFromScanners() { totalFiles += scanner.numFiles totalQuickScanTime += scanner.quickScanTime totalFullScanTime += scanner.fullScanTime - - // Track most recent scan if scanner.lastScanned.After(mostRecentScan) { mostRecentScan = scanner.lastScanned } - - // Check if all scanners have scanned at least once if scanner.lastScanned.IsZero() { allScannedAtLeastOnce = false } - if !allScannedAtLeastOnce { - continue - } - - // Track highest complexity (most conservative assessment) - // Only consider scanners that have been assessed (complexity > 0) - if scanner.complexity > 0 && scanner.complexity > maxComplexity { - maxComplexity = scanner.complexity - } - } - if allScannedAtLeastOnce && idx.FullScanTime == 0 { - maxComplexity = 1 // assess as simple because it took 0 seconds total } - - // Update Index-level stats idx.NumDirs = totalDirs idx.NumFiles = totalFiles idx.QuickScanTime = totalQuickScanTime idx.FullScanTime = totalFullScanTime - idx.Complexity = maxComplexity - - // Update last indexed time + if allScannedAtLeastOnce { + idx.Complexity = calculateComplexity(totalFullScanTime, totalDirs) + } else { + idx.Complexity = 0 + } if !mostRecentScan.IsZero() { idx.LastIndexed = mostRecentScan idx.LastIndexedUnix = mostRecentScan.Unix() idx.wasIndexed = true } - - // Log first complete scan round (once) if allScannedAtLeastOnce && !idx.hasLoggedInitialScan { totalDuration := time.Since(idx.initialScanStartTime) truncatedToSecond := totalDuration.Truncate(time.Second) logger.Debugf("Time spent indexing [%v]: %v seconds", idx.Name, truncatedToSecond) idx.hasLoggedInitialScan = true } - - // Update status: if all scanners have completed at least one scan, mark as READY if allScannedAtLeastOnce && idx.activeScannerPath == "" { idx.Status = READY - // Send update event to notify clients - idx.mu.Unlock() + } else if idx.activeScannerPath != "" { + idx.Status = INDEXING + } + newDiskUsed := idx.totalSize + newStatus := idx.Status + idx.mu.Unlock() + statsChanged := prevNumDirs != totalDirs || prevNumFiles != totalFiles || prevDiskUsed != newDiskUsed + statusChanged := prevStatus != newStatus + if statsChanged || statusChanged { err := idx.SendSourceUpdateEvent() - idx.mu.Lock() if err != nil { logger.Errorf("Error sending source update event: %v", err) } - } else if idx.activeScannerPath != "" { - idx.Status = INDEXING + // Update previousTotalSize after sending event so next aggregation can detect changes + idx.mu.Lock() + idx.previousTotalSize = newDiskUsed + idx.mu.Unlock() } - } // GetScannerStatus returns detailed information about all active scanners diff --git a/backend/indexing/mutate.go b/backend/indexing/mutate.go index 7e400f2a3..3922181b1 100644 --- a/backend/indexing/mutate.go +++ b/backend/indexing/mutate.go @@ -177,15 +177,23 @@ func (idx *Index) GetDirectories() map[string]*iteminfo.FileInfo { return idx.Directories } -func GetIndexInfo(sourceName string) (ReducedIndex, error) { +func GetIndexInfo(sourceName string, forceCacheRefresh bool) (ReducedIndex, error) { idx, ok := indexes[sourceName] if !ok { return ReducedIndex{}, fmt.Errorf("index %s not found", sourceName) } + + // Only update disk total if cache is missing or 
explicitly forced + // The "used" value comes from totalSize and is always current sourcePath := idx.Path cacheKey := "usageCache-" + sourceName + if forceCacheRefresh { + // Invalidate cache to force update + utils.DiskUsageCache.Delete(cacheKey) + } _, ok = utils.DiskUsageCache.Get(cacheKey) if !ok { + // Only fetch disk total if not cached (this is expensive, so we cache it) totalBytes, err := getPartitionSize(sourcePath) if err != nil { idx.mu.Lock() @@ -215,7 +223,13 @@ func GetIndexInfo(sourceName string) (ReducedIndex, error) { } idx.mu.RUnlock() + // Get fresh values from the index (with lock to ensure consistency) + idx.mu.RLock() reducedIdx := idx.ReducedIndex + // Ensure DiskUsed is up to date from totalSize + reducedIdx.DiskUsed = idx.totalSize + reducedIdx.DiskTotal = idx.DiskTotal reducedIdx.Scanners = scannerInfos + idx.mu.RUnlock() return reducedIdx, nil } diff --git a/backend/indexing/unix.go b/backend/indexing/unix.go index 22aafa9f4..b21780c81 100644 --- a/backend/indexing/unix.go +++ b/backend/indexing/unix.go @@ -62,7 +62,8 @@ func getFileDetails(sys any, filePath string) (uint64, uint64, uint64, bool) { // handleFile processes a file and returns its size and whether it should be counted // On Unix, always uses syscall to get allocated size (du-like behavior) -func (idx *Index) handleFile(file os.FileInfo, fullCombined string, realFilePath string) (size uint64, shouldCountSize bool) { +// isRoutineScan: if true, updates the global totalSize; if false (API calls), only returns size +func (idx *Index) handleFile(file os.FileInfo, fullCombined string, realFilePath string, isRoutineScan bool) (size uint64, shouldCountSize bool) { var realSize uint64 var nlink uint64 var ino uint64 @@ -87,14 +88,20 @@ func (idx *Index) handleFile(file os.FileInfo, fullCombined string, realFilePath // First time seeing this inode. idx.processedInodes[ino] = struct{}{} idx.FoundHardLinks[fullCombined] = realSize + // Only update totalSize during routine scans (not API calls) + if isRoutineScan { idx.totalSize += realSize + } return realSize, true // Count size for directory total. } // It's a regular file. + // Only update totalSize during routine scans (not API calls) + if isRoutineScan { idx.mu.Lock() idx.totalSize += realSize idx.mu.Unlock() + } return realSize, true // Count size. 
} diff --git a/backend/indexing/windows.go b/backend/indexing/windows.go index fe73425a9..4c9943e9f 100644 --- a/backend/indexing/windows.go +++ b/backend/indexing/windows.go @@ -57,14 +57,18 @@ func getFileDetails(sys any, filePath string) (uint64, uint64, uint64, bool) { // handleFile processes a file and returns its size and whether it should be counted // On Windows, uses file.Size() directly (no syscall support for allocated size) -func (idx *Index) handleFile(file os.FileInfo, fullCombined string, realFilePath string) (size uint64, shouldCountSize bool) { +// isRoutineScan: if true, updates the global totalSize; if false (API calls), only returns size +func (idx *Index) handleFile(file os.FileInfo, fullCombined string, realFilePath string, isRoutineScan bool) (size uint64, shouldCountSize bool) { // On Windows, just use the actual file size realSize := uint64(file.Size()) // Windows doesn't support hard links in the same way, so always count size + // Only update totalSize during routine scans (not API calls) + if isRoutineScan { idx.mu.Lock() idx.totalSize += realSize idx.mu.Unlock() + } return realSize, true } diff --git a/backend/swagger/docs/docs.go b/backend/swagger/docs/docs.go index 865c38c81..4be61f9bb 100644 --- a/backend/swagger/docs/docs.go +++ b/backend/swagger/docs/docs.go @@ -1214,13 +1214,10 @@ const docTemplate = `{ }, "/api/raw": { "get": { - "description": "Returns the raw content of a file, multiple files, or a directory. Supports downloading files as archives in various formats.", + "description": "Returns the raw content of a file, multiple files, or a directory. Supports downloading files as archives in various formats.\n\n**Filename Encoding:**\n- The Content-Disposition header will always include both:\n1. ` + "`" + `filename=\"...\"` + "`" + `: An ASCII-safe version of the filename for compatibility.\n2. ` + "`" + `filename*=utf-8''...` + "`" + `: The full UTF-8 encoded filename (RFC 6266/5987) for modern clients.", "consumes": [ "application/json" ], - "produces": [ - "application/json" - ], "tags": [ "Resources" ], diff --git a/backend/swagger/docs/swagger.json b/backend/swagger/docs/swagger.json index 75a5858b4..03a0470b8 100644 --- a/backend/swagger/docs/swagger.json +++ b/backend/swagger/docs/swagger.json @@ -1203,13 +1203,10 @@ }, "/api/raw": { "get": { - "description": "Returns the raw content of a file, multiple files, or a directory. Supports downloading files as archives in various formats.", + "description": "Returns the raw content of a file, multiple files, or a directory. Supports downloading files as archives in various formats.\n\n**Filename Encoding:**\n- The Content-Disposition header will always include both:\n1. `filename=\"...\"`: An ASCII-safe version of the filename for compatibility.\n2. `filename*=utf-8''...`: The full UTF-8 encoded filename (RFC 6266/5987) for modern clients.", "consumes": [ "application/json" ], - "produces": [ - "application/json" - ], "tags": [ "Resources" ], diff --git a/backend/swagger/docs/swagger.yaml b/backend/swagger/docs/swagger.yaml index eb379636d..0a6e3a9e6 100644 --- a/backend/swagger/docs/swagger.yaml +++ b/backend/swagger/docs/swagger.yaml @@ -2366,8 +2366,13 @@ paths: get: consumes: - application/json - description: Returns the raw content of a file, multiple files, or a directory. - Supports downloading files as archives in various formats. + description: |- + Returns the raw content of a file, multiple files, or a directory. Supports downloading files as archives in various formats. 
+ + **Filename Encoding:** + - The Content-Disposition header will always include both: + 1. `filename="..."`: An ASCII-safe version of the filename for compatibility. + 2. `filename*=utf-8''...`: The full UTF-8 encoded filename (RFC 6266/5987) for modern clients. parameters: - description: a list of files in the following format 'source::filename' and separated by '||' with additional items in the list. (required) in: query name: files required: true type: string - description: 'Compression algorithm for archiving multiple files or directories. Options: ''zip'' and ''tar.gz''. Default is ''zip''.' in: query name: algo type: string - produces: - - application/json responses: "200": description: Raw file or directory content, or archive for multiple files diff --git a/frontend/public/index.html b/frontend/public/index.html index d54f52241..4f6d97153 100644 --- a/frontend/public/index.html +++ b/frontend/public/index.html @@ -5,7 +5,7 @@ -