diff options
| -rw-r--r-- | Cargo.lock | 10 | ||||
| -rw-r--r-- | Cargo.toml | 1 | ||||
| -rw-r--r-- | src/epub.rs | 91 | ||||
| -rw-r--r-- | src/main.rs | 1 |
4 files changed, 103 insertions, 0 deletions
@@ -1081,6 +1081,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "unicode-normalization", ] [[package]] @@ -1506,6 +1507,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] name = "untrusted" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -13,3 +13,4 @@ serde_json = "1.0" tokio = { version = "1.49", features = ["rt-multi-thread", "macros"] } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] } +unicode-normalization = "0.1" diff --git a/src/epub.rs b/src/epub.rs new file mode 100644 index 0000000..c984dde --- /dev/null +++ b/src/epub.rs @@ -0,0 +1,91 @@ +use std::path::{Path, PathBuf}; +use unicode_normalization::UnicodeNormalization; + +pub struct EpubSkeleton { + /// Books/<book_title (book_id)>/ + pub root: PathBuf, + pub meta_inf: PathBuf, + pub oebps: PathBuf, +} + +impl EpubSkeleton { + pub fn plan(base_books_dir: &Path, title: &str, bookid: &str) -> Self { + // Maximum number of bytes in a filename. + const MAX_BYTES: usize = 255; + let clean_title = sanitize_filename(title); + let root_name = if !clean_title.is_empty() { + // Title length should take into account the bookid, space, and () characters. + let title_max_length = MAX_BYTES.saturating_sub(3 + bookid.len()); + let truncated_title = truncate_utf8_by_byte(&clean_title, title_max_length); + format!("{} ({})", truncated_title, bookid) + } else { + format!("({})", bookid) + }; + let root_dir = base_books_dir.join(root_name); + Self { + meta_inf: root_dir.join("META-INF"), + oebps: root_dir.join("OEBPS"), + root: root_dir, + } + } +} + +/// Sanitize a filename component for cross‑platform compatibility. +/// Applies sensible defaults: +/// - Normalize to NFC +/// - Replace illegal characters: <>:"/\\|?* +/// - Remove control characters +/// - Collapse whitespace +/// - Trim whitespace +fn sanitize_filename(input: &str) -> String { + // Normalize to NFC to ensure consistency - characters displayed the same are stored the same. + let mut s = input.nfc().collect::<String>(); + + // Replace illegal Windows/FAT characters + control chars + const ILLEGAL: &[char] = &['<', '>', ':', '"', '/', '\\', '|', '?', '*']; + let mut cleaned = String::with_capacity(s.len()); + + for ch in s.chars() { + if ch.is_control() || ILLEGAL.contains(&ch) { + cleaned.push('_'); + } else { + cleaned.push(ch); + } + } + s = cleaned; + + // Collapse whitespace + let mut cleaned = String::with_capacity(s.len()); + let mut prev_was_whitespace = false; + for ch in s.chars() { + if ch.is_whitespace() { + if !prev_was_whitespace { + cleaned.push(' '); + prev_was_whitespace = true; + } + } else { + cleaned.push(ch); + prev_was_whitespace = false; + } + } + cleaned.trim().to_string() +} + +/// Truncate a UTF‑8 string safely without splitting codepoints. +fn truncate_utf8_by_byte(s: &str, max_bytes: usize) -> &str { + if s.len() <= max_bytes { + return s; + } + + let mut end = max_bytes; + // Back up until we end with a non-continuation byte. + while end > 0 && !s.is_char_boundary(end) { + end -= 1; + } + + if end == 0 { + return ""; + } + + &s[..end] +} diff --git a/src/main.rs b/src/main.rs index d393a55..b963850 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ mod cli; mod config; mod cookies; mod display; +mod epub; mod http_client; mod orly; |
