diff options
| author | A Farzat <a@farzat.xyz> | 2026-02-13 19:50:52 +0300 |
|---|---|---|
| committer | A Farzat <a@farzat.xyz> | 2026-02-13 19:59:25 +0300 |
| commit | edf46d8965752725ef3305c9d01decc038457db0 (patch) | |
| tree | a5fc876f65dc53f350a58d3816c41942dc3ce570 | |
| parent | 57bc69a7f9af497526695e5a0bfbc60939f667e9 (diff) | |
| download | safaribooks-rs-edf46d8965752725ef3305c9d01decc038457db0.tar.gz safaribooks-rs-edf46d8965752725ef3305c9d01decc038457db0.zip | |
Create the basic directory structure for epub
The directory name needs to be sanitized so that it is valid on all
major operating systems and filesystems. This includes replacing invalid
characters and limiting the name to 255 bytes.
| -rw-r--r-- | Cargo.lock | 10 | ||||
| -rw-r--r-- | Cargo.toml | 1 | ||||
| -rw-r--r-- | src/epub.rs | 91 | ||||
| -rw-r--r-- | src/main.rs | 1 |
4 files changed, 103 insertions, 0 deletions
@@ -1081,6 +1081,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "unicode-normalization", ] [[package]] @@ -1506,6 +1507,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] name = "untrusted" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -13,3 +13,4 @@ serde_json = "1.0" tokio = { version = "1.49", features = ["rt-multi-thread", "macros"] } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] } +unicode-normalization = "0.1" diff --git a/src/epub.rs b/src/epub.rs new file mode 100644 index 0000000..c984dde --- /dev/null +++ b/src/epub.rs @@ -0,0 +1,91 @@ +use std::path::{Path, PathBuf}; +use unicode_normalization::UnicodeNormalization; + +pub struct EpubSkeleton { + /// Book root directory: <base_books_dir>/<book_title> (<bookid>)/ + pub root: PathBuf, + pub meta_inf: PathBuf, + pub oebps: PathBuf, +} + +impl EpubSkeleton { + pub fn plan(base_books_dir: &Path, title: &str, bookid: &str) -> Self { + // Maximum number of bytes in a filename. + const MAX_BYTES: usize = 255; + let clean_title = sanitize_filename(title); + let root_name = if !clean_title.is_empty() { + // Reserve room for the " (<bookid>)" suffix: the bookid plus a space and two parentheses (3 bytes).
+ let title_max_length = MAX_BYTES.saturating_sub(3 + bookid.len()); + let truncated_title = truncate_utf8_by_byte(&clean_title, title_max_length); + format!("{} ({})", truncated_title, bookid) + } else { + format!("({})", bookid) + }; + let root_dir = base_books_dir.join(root_name); + Self { + meta_inf: root_dir.join("META-INF"), + oebps: root_dir.join("OEBPS"), + root: root_dir, + } + } +} + +/// Sanitize a filename component for cross-platform compatibility. +/// Applies sensible defaults: +/// - Normalize to NFC +/// - Replace illegal characters (<>:"/\|?*) with '_' +/// - Replace control characters with '_' as well +/// - Collapse each run of whitespace into a single space +/// - Trim leading and trailing whitespace +fn sanitize_filename(input: &str) -> String { + // Normalize to NFC to ensure consistency - characters displayed the same are stored the same. + let mut s = input.nfc().collect::<String>(); + + // Replace illegal Windows/FAT characters and control chars (tabs and newlines included) with '_' + const ILLEGAL: &[char] = &['<', '>', ':', '"', '/', '\\', '|', '?', '*']; + let mut cleaned = String::with_capacity(s.len()); + + for ch in s.chars() { + if ch.is_control() || ILLEGAL.contains(&ch) { + cleaned.push('_'); + } else { + cleaned.push(ch); + } + } + s = cleaned; + + // Collapse each whitespace run into one ASCII space + let mut cleaned = String::with_capacity(s.len()); + let mut prev_was_whitespace = false; + for ch in s.chars() { + if ch.is_whitespace() { + if !prev_was_whitespace { + cleaned.push(' '); + prev_was_whitespace = true; + } + } else { + cleaned.push(ch); + prev_was_whitespace = false; + } + } + cleaned.trim().to_string() +} + +/// Truncate a UTF-8 string to at most `max_bytes` bytes without splitting a code point. +fn truncate_utf8_by_byte(s: &str, max_bytes: usize) -> &str { + if s.len() <= max_bytes { + return s; + } + + let mut end = max_bytes; + // Back up until `end` falls on a char boundary, so the slice below cannot split a code point.
+ while end > 0 && !s.is_char_boundary(end) { + end -= 1; + } + + if end == 0 { + return ""; + } + + &s[..end] +} diff --git a/src/main.rs b/src/main.rs index d393a55..b963850 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ mod cli; mod config; mod cookies; mod display; +mod epub; mod http_client; mod orly; |
