Skip to main content

problemreductions/models/misc/
minimum_internal_macro_data_compression.rs

1//! Minimum Internal Macro Data Compression problem implementation.
2//!
3//! Given an alphabet Σ, a string s ∈ Σ*, and a pointer cost h,
4//! find a single compressed string C ∈ (Σ ∪ {pointers})* minimizing the cost
5//! |C| + (h−1) × (number of pointer occurrences in C),
6//! such that s can be obtained from C by resolving all pointer references
7//! within C itself (left-to-right, greedy longest match).
8//!
9//! Unlike external macro compression, there is no separate dictionary — the
10//! compressed string C serves as both dictionary and output.
11//!
12//! This problem is NP-hard (Storer, 1977; Storer & Szymanski, 1978).
13//! Reference: Garey & Johnson A4 SR23.
14
15use crate::registry::{FieldInfo, ProblemSchemaEntry};
16use crate::traits::Problem;
17use crate::types::Min;
18use serde::{Deserialize, Serialize};
19
20inventory::submit! {
21    ProblemSchemaEntry {
22        name: "MinimumInternalMacroDataCompression",
23        display_name: "Minimum Internal Macro Data Compression",
24        aliases: &[],
25        dimensions: &[],
26        module_path: module_path!(),
27        description: "Find minimum-cost self-referencing compression of a string with embedded pointers",
28        fields: &[
29            FieldInfo { name: "alphabet_size", type_name: "usize", description: "Size of the alphabet (symbols indexed 0..alphabet_size)" },
30            FieldInfo { name: "string", type_name: "Vec<usize>", description: "Source string as symbol indices" },
31            FieldInfo { name: "pointer_cost", type_name: "usize", description: "Pointer cost h (each pointer adds h−1 extra to the cost)" },
32        ],
33    }
34}
35
36/// Minimum Internal Macro Data Compression problem.
37///
38/// Given an alphabet of size `k`, a string `s` over `{0, ..., k-1}`, and
39/// a pointer cost `h`, find a compressed string C that minimizes
40/// cost = |C| + (h−1) × (pointer count in C), where C uses itself as both
41/// dictionary and compressed output.
42///
43/// # Representation
44///
45/// The configuration is a vector of `string_len` entries. Each entry is:
46/// - A symbol index in `{0, ..., alphabet_size-1}` (literal)
47/// - `alphabet_size` (end-of-string marker; positions after this are padding)
48/// - A value in `{alphabet_size+1, ..., alphabet_size + string_len}`,
49///   encoding a pointer to C\[v − alphabet_size − 1\] with greedy longest match.
50///
51/// During decoding, pointers are resolved left-to-right. A pointer at position
52/// i referencing position j (where j < i in the decoded output) copies symbols
53/// from the already-decoded output starting at j using greedy longest match.
54///
55/// # Example
56///
57/// ```
58/// use problemreductions::models::misc::MinimumInternalMacroDataCompression;
59/// use problemreductions::{Problem, Solver, BruteForce};
60///
61/// // Alphabet {a, b}, string "abab", pointer cost h=2
62/// let problem = MinimumInternalMacroDataCompression::new(2, vec![0, 1, 0, 1], 2);
63/// let solver = BruteForce::new();
64/// let solution = solver.find_witness(&problem);
65/// assert!(solution.is_some());
66/// ```
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct MinimumInternalMacroDataCompression {
69    alphabet_size: usize,
70    string: Vec<usize>,
71    pointer_cost: usize,
72}
73
74impl MinimumInternalMacroDataCompression {
75    /// Create a new MinimumInternalMacroDataCompression instance.
76    ///
77    /// # Panics
78    ///
79    /// Panics if `alphabet_size` is 0 and the string is non-empty, or if
80    /// any symbol in the string is >= `alphabet_size`, or if `pointer_cost` is 0.
81    pub fn new(alphabet_size: usize, string: Vec<usize>, pointer_cost: usize) -> Self {
82        assert!(
83            alphabet_size > 0 || string.is_empty(),
84            "alphabet_size must be > 0 when the string is non-empty"
85        );
86        assert!(
87            string
88                .iter()
89                .all(|&s| s < alphabet_size || alphabet_size == 0),
90            "all symbols must be less than alphabet_size"
91        );
92        assert!(pointer_cost > 0, "pointer_cost must be positive");
93        Self {
94            alphabet_size,
95            string,
96            pointer_cost,
97        }
98    }
99
100    /// Returns the length of the source string.
101    pub fn string_len(&self) -> usize {
102        self.string.len()
103    }
104
105    /// Returns the alphabet size.
106    pub fn alphabet_size(&self) -> usize {
107        self.alphabet_size
108    }
109
110    /// Returns the pointer cost h.
111    pub fn pointer_cost(&self) -> usize {
112        self.pointer_cost
113    }
114
115    /// Returns the source string.
116    pub fn string(&self) -> &[usize] {
117        &self.string
118    }
119
120    /// Decode the compressed string C and return the decoded string,
121    /// the active length of C, and the pointer count.
122    /// Returns None if decoding fails (invalid pointer, circular reference, etc.).
123    fn decode(&self, config: &[usize]) -> Option<(Vec<usize>, usize, usize)> {
124        let n = self.string.len();
125        let k = self.alphabet_size;
126        let eos = k; // end-of-string marker
127
128        // Find active length: prefix before first end-of-string marker
129        let active_len = config.iter().position(|&v| v == eos).unwrap_or(n);
130
131        // Verify contiguous: all after first EOS must be EOS or padding
132        for &v in &config[active_len..] {
133            if v != eos {
134                return None;
135            }
136        }
137
138        // Decode left-to-right. A pointer at compressed position c_idx
139        // referencing C[j] copies from the decoded output that existed
140        // before this pointer (no overlapping/runaway copy).
141        let mut decoded = Vec::new();
142        let mut pointer_count: usize = 0;
143
144        for &v in &config[..active_len] {
145            if v < k {
146                // Literal symbol
147                decoded.push(v);
148            } else if v > k {
149                // Pointer: references C[ref_pos] in the compressed string
150                let ref_pos = v - k - 1;
151                if ref_pos >= decoded.len() {
152                    return None; // pointer references undecoded position
153                }
154                // Greedy longest match from decoded[ref_pos..copy_start]
155                // (only pre-existing decoded content, no overlapping copy)
156                let copy_start = decoded.len();
157                let mut matched = 0;
158                while copy_start + matched < n {
159                    let src_idx = ref_pos + matched;
160                    if src_idx >= copy_start {
161                        break; // cannot read beyond pre-existing content
162                    }
163                    if decoded[src_idx] != self.string[copy_start + matched] {
164                        break;
165                    }
166                    decoded.push(decoded[src_idx]);
167                    matched += 1;
168                }
169                if matched == 0 {
170                    return None; // pointer must copy at least one symbol
171                }
172                pointer_count += 1;
173            } else {
174                // v == eos, but we filtered those out above
175                return None;
176            }
177        }
178
179        Some((decoded, active_len, pointer_count))
180    }
181}
182
183impl Problem for MinimumInternalMacroDataCompression {
184    const NAME: &'static str = "MinimumInternalMacroDataCompression";
185    type Value = Min<usize>;
186
187    fn variant() -> Vec<(&'static str, &'static str)> {
188        crate::variant_params![]
189    }
190
191    fn dims(&self) -> Vec<usize> {
192        let n = self.string.len();
193        let domain = self.alphabet_size + n + 1; // literals + EOS + pointers
194        vec![domain; n]
195    }
196
197    fn evaluate(&self, config: &[usize]) -> Min<usize> {
198        let n = self.string.len();
199        if config.len() != n {
200            return Min(None);
201        }
202
203        // Handle empty string
204        if n == 0 {
205            return Min(Some(0));
206        }
207
208        match self.decode(config) {
209            Some((decoded, active_len, pointer_count)) => {
210                if decoded != self.string {
211                    Min(None)
212                } else {
213                    let cost = active_len + (self.pointer_cost - 1) * pointer_count;
214                    Min(Some(cost))
215                }
216            }
217            None => Min(None),
218        }
219    }
220}
221
222crate::declare_variants! {
223    default MinimumInternalMacroDataCompression => "(alphabet_size + string_len + 1) ^ string_len",
224}
225
226#[cfg(feature = "example-db")]
227pub(crate) fn canonical_model_example_specs() -> Vec<crate::example_db::specs::ModelExampleSpec> {
228    // Issue #442 example: alphabet {a,b,c} (3), s="abcabcabc" (9), h=2
229    // Optimal: C = [a, b, c, ptr(0), ptr(0), EOS, EOS, EOS, EOS]
230    //   active_len = 5, pointers = 2
231    //   cost = 5 + (2-1)*2 = 7
232    //
233    // Config encoding:
234    //   alphabet_size = 3, string_len = 9, domain = 3+9+1 = 13
235    //   Literals: 0=a, 1=b, 2=c
236    //   EOS: 3
237    //   Pointers: 4=ptr(C[0]), 5=ptr(C[1]), ...
238    let s: Vec<usize> = vec![0, 1, 2, 0, 1, 2, 0, 1, 2];
239    let optimal_config = vec![
240        0, 1, 2, // literals a, b, c
241        4, // ptr(C[0]) -> greedy "abc"
242        4, // ptr(C[0]) -> greedy "abc"
243        3, 3, 3, 3, // EOS padding
244    ];
245    vec![crate::example_db::specs::ModelExampleSpec {
246        id: "minimum_internal_macro_data_compression",
247        instance: Box::new(MinimumInternalMacroDataCompression::new(3, s, 2)),
248        optimal_config,
249        optimal_value: serde_json::json!(7),
250    }]
251}
252
253#[cfg(test)]
254#[path = "../../unit_tests/models/misc/minimum_internal_macro_data_compression.rs"]
255mod tests;