From edf46d8965752725ef3305c9d01decc038457db0 Mon Sep 17 00:00:00 2001 From: A Farzat Date: Fri, 13 Feb 2026 19:50:52 +0300 Subject: Create the basic directory structure for epub The directory name needs to be sanitized such that it is valid on all major Operating systems and filesystems. This includes replacing invalid characters and limiting the filename to 255 bytes. --- Cargo.lock | 10 +++++++ Cargo.toml | 1 + src/epub.rs | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 1 + 4 files changed, 103 insertions(+) create mode 100644 src/epub.rs diff --git a/Cargo.lock b/Cargo.lock index 1b1a448..e971da4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1081,6 +1081,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "unicode-normalization", ] [[package]] @@ -1505,6 +1506,15 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + [[package]] name = "untrusted" version = "0.9.0" diff --git a/Cargo.toml b/Cargo.toml index 70b1e11..efd6672 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,3 +13,4 @@ serde_json = "1.0" tokio = { version = "1.49", features = ["rt-multi-thread", "macros"] } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] } +unicode-normalization = "0.1" diff --git a/src/epub.rs b/src/epub.rs new file mode 100644 index 0000000..c984dde --- /dev/null +++ b/src/epub.rs @@ -0,0 +1,91 @@ +use std::path::{Path, PathBuf}; +use unicode_normalization::UnicodeNormalization; + +pub struct EpubSkeleton { + /// Books// + pub root: PathBuf, + pub meta_inf: PathBuf, + pub oebps: PathBuf, +} + +impl EpubSkeleton { + pub fn plan(base_books_dir: 
&Path, title: &str, bookid: &str) -> Self { + // Maximum number of bytes in a filename. + const MAX_BYTES: usize = 255; + let clean_title = sanitize_filename(title); + let root_name = if !clean_title.is_empty() { + // Title length should take into account the bookid, space, and () characters. + let title_max_length = MAX_BYTES.saturating_sub(3 + bookid.len()); + let truncated_title = truncate_utf8_by_byte(&clean_title, title_max_length); + format!("{} ({})", truncated_title, bookid) + } else { + format!("({})", bookid) + }; + let root_dir = base_books_dir.join(root_name); + Self { + meta_inf: root_dir.join("META-INF"), + oebps: root_dir.join("OEBPS"), + root: root_dir, + } + } +} + +/// Sanitize a filename component for cross‑platform compatibility. +/// Applies sensible defaults: +/// - Normalize to NFC +/// - Replace illegal characters: <>:"/\\|?* +/// - Remove control characters +/// - Collapse whitespace +/// - Trim whitespace +fn sanitize_filename(input: &str) -> String { + // Normalize to NFC to ensure consistency - characters displayed the same are stored the same. + let mut s = input.nfc().collect::(); + + // Replace illegal Windows/FAT characters + control chars + const ILLEGAL: &[char] = &['<', '>', ':', '"', '/', '\\', '|', '?', '*']; + let mut cleaned = String::with_capacity(s.len()); + + for ch in s.chars() { + if ch.is_control() || ILLEGAL.contains(&ch) { + cleaned.push('_'); + } else { + cleaned.push(ch); + } + } + s = cleaned; + + // Collapse whitespace + let mut cleaned = String::with_capacity(s.len()); + let mut prev_was_whitespace = false; + for ch in s.chars() { + if ch.is_whitespace() { + if !prev_was_whitespace { + cleaned.push(' '); + prev_was_whitespace = true; + } + } else { + cleaned.push(ch); + prev_was_whitespace = false; + } + } + cleaned.trim().to_string() +} + +/// Truncate a UTF‑8 string safely without splitting codepoints. 
/// Truncate a UTF-8 string to at most `max_bytes` bytes without splitting a
/// code point.
///
/// Returns `s` unchanged when it already fits. Otherwise returns the longest
/// prefix of `s` that is no longer than `max_bytes` bytes and ends on a
/// character boundary; this is the empty string when not even the first
/// character fits.
fn truncate_utf8_by_byte(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }

    // Here `max_bytes < s.len()`, so every candidate index is in range. Walk
    // backwards from the limit to the nearest character boundary; index 0 is
    // always a boundary, so the search cannot come up empty.
    let end = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);

    &s[..end]
}