Create the basic directory structure for epub

The directory name needs to be sanitized so that it is valid on all major operating systems and filesystems. This includes replacing invalid characters and limiting the directory name to 255 bytes.
author: A Farzat <a@farzat.xyz> 2026-02-13 19:50:52 +0300
committer: A Farzat <a@farzat.xyz> 2026-02-13 19:59:25 +0300
commit: edf46d8965752725ef3305c9d01decc038457db0 (patch)
tree: a5fc876f65dc53f350a58d3816c41942dc3ce570
parent: 57bc69a7f9af497526695e5a0bfbc60939f667e9 (diff)
download: safaribooks-rs-edf46d8965752725ef3305c9d01decc038457db0.tar.gz
safaribooks-rs-edf46d8965752725ef3305c9d01decc038457db0.zip
4 files changed, 103 insertions, 0 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 1b1a448..e971da4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1081,6 +1081,7 @@ dependencies = [
  "tokio",
  "tracing",
  "tracing-subscriber",
+ "unicode-normalization",
 ]
 
 [[package]]
@@ -1506,6 +1507,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
 
 [[package]]
+name = "unicode-normalization"
+version = "0.1.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
+dependencies = [
+ "tinyvec",
+]
+
+[[package]]
 name = "untrusted"
 version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 70b1e11..efd6672 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,3 +13,4 @@ serde_json = "1.0"
 tokio = { version = "1.49", features = ["rt-multi-thread", "macros"] }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
+unicode-normalization = "0.1"
diff --git a/src/epub.rs b/src/epub.rs
new file mode 100644
index 0000000..c984dde
--- /dev/null
+++ b/src/epub.rs
@@ -0,0 +1,91 @@
+use std::path::{Path, PathBuf};
+use unicode_normalization::UnicodeNormalization;
+
+/// Paths of the directories that make up the on-disk skeleton of one EPUB.
+///
+/// The struct only holds paths; building it does not create anything on disk.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct EpubSkeleton {
+    /// Books/<book_title (book_id)>/
+    pub root: PathBuf,
+    /// `<root>/META-INF/`
+    pub meta_inf: PathBuf,
+    /// `<root>/OEBPS/`
+    pub oebps: PathBuf,
+}
+
+impl EpubSkeleton {
+    /// Compute the directory layout for a book without touching the filesystem.
+    ///
+    /// The root directory is `<base_books_dir>/<title> (<bookid>)`, or
+    /// `<base_books_dir>/(<bookid>)` when no usable title is left after
+    /// sanitizing and truncating. Both `title` and `bookid` are sanitized, and the
+    /// directory name is kept within 255 bytes by shortening the title first.
+    pub fn plan(base_books_dir: &Path, title: &str, bookid: &str) -> Self {
+        // Maximum number of bytes in a filename.
+        const MAX_BYTES: usize = 255;
+        // Bytes used by the parentheses around the id.
+        const PAREN_BYTES: usize = 2;
+        // Bytes used by the separating space plus the parentheses.
+        const DECORATION_BYTES: usize = PAREN_BYTES + 1;
+
+        // The id becomes part of the directory name too, so it must not be able to
+        // smuggle in path separators or other characters that are invalid in a name.
+        let sanitized_id = sanitize_filename(bookid);
+        // An absurdly long id must still leave room for its parentheses.
+        let id_part = truncate_utf8_by_byte(&sanitized_id, MAX_BYTES - PAREN_BYTES).trim_end();
+
+        let clean_title = sanitize_filename(title);
+        let title_max_length = MAX_BYTES.saturating_sub(DECORATION_BYTES + id_part.len());
+        // Truncation can cut right after a space; trim it so the name does not
+        // contain a double space before the opening parenthesis.
+        let truncated_title = truncate_utf8_by_byte(&clean_title, title_max_length).trim_end();
+
+        let root_name = if truncated_title.is_empty() {
+            format!("({})", id_part)
+        } else {
+            format!("{} ({})", truncated_title, id_part)
+        };
+        let root_dir = base_books_dir.join(root_name);
+        Self {
+            meta_inf: root_dir.join("META-INF"),
+            oebps: root_dir.join("OEBPS"),
+            root: root_dir,
+        }
+    }
+}
+
+/// Sanitize a filename component for cross‑platform compatibility.
+///
+/// Works in a single pass over the NFC-normalized input:
+/// - Normalize to NFC, so characters displayed the same are stored the same
+/// - Collapse every run of whitespace into a single space; this includes tab,
+///   newline and carriage return, which are control characters as well
+/// - Replace illegal characters `<>:"/\|?*` and the remaining control
+///   characters with `_`
+/// - Trim leading and trailing whitespace
+///
+/// The result may be empty, e.g. for an empty or whitespace-only input.
+fn sanitize_filename(input: &str) -> String {
+    // Characters rejected by Windows/FAT filesystems.
+    const ILLEGAL: &[char] = &['<', '>', ':', '"', '/', '\\', '|', '?', '*'];
+
+    let mut cleaned = String::with_capacity(input.len());
+    // Starting in the "whitespace seen" state drops leading whitespace for free.
+    let mut prev_was_whitespace = true;
+
+    for ch in input.nfc() {
+        // Whitespace is checked first: tab/newline/CR are also control characters
+        // and would otherwise be turned into underscores instead of being collapsed.
+        if ch.is_whitespace() {
+            if !prev_was_whitespace {
+                cleaned.push(' ');
+            }
+            prev_was_whitespace = true;
+        } else {
+            prev_was_whitespace = false;
+            if ch.is_control() || ILLEGAL.contains(&ch) {
+                cleaned.push('_');
+            } else {
+                cleaned.push(ch);
+            }
+        }
+    }
+
+    // Runs are collapsed to one space, so at most one trailing space can remain.
+    if cleaned.ends_with(' ') {
+        cleaned.pop();
+    }
+    cleaned
+}
+
+/// Return the longest prefix of `s` that fits in `max_bytes` bytes and ends on a
+/// character boundary, so no codepoint is ever split.
+///
+/// The input is returned unchanged when it already fits; the result is empty when
+/// not even the first character fits.
+fn truncate_utf8_by_byte(s: &str, max_bytes: usize) -> &str {
+    if s.len() <= max_bytes {
+        return s;
+    }
+
+    // Walk back from the byte limit to the nearest character boundary. Index 0 is
+    // always a boundary, so the search cannot come up empty.
+    let cut = (0..=max_bytes)
+        .rev()
+        .find(|&idx| s.is_char_boundary(idx))
+        .unwrap_or(0);
+
+    &s[..cut]
+}
diff --git a/src/main.rs b/src/main.rs
index d393a55..b963850 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,6 +2,7 @@ mod cli;
 mod config;
 mod cookies;
 mod display;
+mod epub;
 mod http_client;
 mod orly;
author	A Farzat <a@farzat.xyz>	2026-02-13 19:50:52 +0300
committer	A Farzat <a@farzat.xyz>	2026-02-13 19:59:25 +0300
commit	edf46d8965752725ef3305c9d01decc038457db0 (patch)
tree	a5fc876f65dc53f350a58d3816c41942dc3ce570
parent	57bc69a7f9af497526695e5a0bfbc60939f667e9 (diff)
download	safaribooks-rs-edf46d8965752725ef3305c9d01decc038457db0.tar.gz safaribooks-rs-edf46d8965752725ef3305c9d01decc038457db0.zip