problemreductions/models/misc/minimum_internal_macro_data_compression.rs
1//! Minimum Internal Macro Data Compression problem implementation.
2//!
3//! Given an alphabet Σ, a string s ∈ Σ*, and a pointer cost h,
4//! find a single compressed string C ∈ (Σ ∪ {pointers})* minimizing the cost
5//! |C| + (h−1) × (number of pointer occurrences in C),
6//! such that s can be obtained from C by resolving all pointer references
7//! within C itself (left-to-right, greedy longest match).
8//!
9//! Unlike external macro compression, there is no separate dictionary — the
10//! compressed string C serves as both dictionary and output.
11//!
12//! This problem is NP-hard (Storer, 1977; Storer & Szymanski, 1978).
13//! Reference: Garey & Johnson A4 SR23.
14
15use crate::registry::{FieldInfo, ProblemSchemaEntry};
16use crate::traits::Problem;
17use crate::types::Min;
18use serde::{Deserialize, Serialize};
19
20inventory::submit! {
21 ProblemSchemaEntry {
22 name: "MinimumInternalMacroDataCompression",
23 display_name: "Minimum Internal Macro Data Compression",
24 aliases: &[],
25 dimensions: &[],
26 module_path: module_path!(),
27 description: "Find minimum-cost self-referencing compression of a string with embedded pointers",
28 fields: &[
29 FieldInfo { name: "alphabet_size", type_name: "usize", description: "Size of the alphabet (symbols indexed 0..alphabet_size)" },
30 FieldInfo { name: "string", type_name: "Vec<usize>", description: "Source string as symbol indices" },
31 FieldInfo { name: "pointer_cost", type_name: "usize", description: "Pointer cost h (each pointer adds h−1 extra to the cost)" },
32 ],
33 }
34}
35
36/// Minimum Internal Macro Data Compression problem.
37///
38/// Given an alphabet of size `k`, a string `s` over `{0, ..., k-1}`, and
39/// a pointer cost `h`, find a compressed string C that minimizes
40/// cost = |C| + (h−1) × (pointer count in C), where C uses itself as both
41/// dictionary and compressed output.
42///
43/// # Representation
44///
45/// The configuration is a vector of `string_len` entries. Each entry is:
46/// - A symbol index in `{0, ..., alphabet_size-1}` (literal)
47/// - `alphabet_size` (end-of-string marker; positions after this are padding)
48/// - A value in `{alphabet_size+1, ..., alphabet_size + string_len}`,
49/// encoding a pointer to C\[v − alphabet_size − 1\] with greedy longest match.
50///
51/// During decoding, pointers are resolved left-to-right. A pointer at position
52/// i referencing position j (where j < i in the decoded output) copies symbols
53/// from the already-decoded output starting at j using greedy longest match.
54///
55/// # Example
56///
57/// ```
58/// use problemreductions::models::misc::MinimumInternalMacroDataCompression;
59/// use problemreductions::{Problem, Solver, BruteForce};
60///
61/// // Alphabet {a, b}, string "abab", pointer cost h=2
62/// let problem = MinimumInternalMacroDataCompression::new(2, vec![0, 1, 0, 1], 2);
63/// let solver = BruteForce::new();
64/// let solution = solver.find_witness(&problem);
65/// assert!(solution.is_some());
66/// ```
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct MinimumInternalMacroDataCompression {
69 alphabet_size: usize,
70 string: Vec<usize>,
71 pointer_cost: usize,
72}
73
74impl MinimumInternalMacroDataCompression {
75 /// Create a new MinimumInternalMacroDataCompression instance.
76 ///
77 /// # Panics
78 ///
79 /// Panics if `alphabet_size` is 0 and the string is non-empty, or if
80 /// any symbol in the string is >= `alphabet_size`, or if `pointer_cost` is 0.
81 pub fn new(alphabet_size: usize, string: Vec<usize>, pointer_cost: usize) -> Self {
82 assert!(
83 alphabet_size > 0 || string.is_empty(),
84 "alphabet_size must be > 0 when the string is non-empty"
85 );
86 assert!(
87 string
88 .iter()
89 .all(|&s| s < alphabet_size || alphabet_size == 0),
90 "all symbols must be less than alphabet_size"
91 );
92 assert!(pointer_cost > 0, "pointer_cost must be positive");
93 Self {
94 alphabet_size,
95 string,
96 pointer_cost,
97 }
98 }
99
100 /// Returns the length of the source string.
101 pub fn string_len(&self) -> usize {
102 self.string.len()
103 }
104
105 /// Returns the alphabet size.
106 pub fn alphabet_size(&self) -> usize {
107 self.alphabet_size
108 }
109
110 /// Returns the pointer cost h.
111 pub fn pointer_cost(&self) -> usize {
112 self.pointer_cost
113 }
114
115 /// Returns the source string.
116 pub fn string(&self) -> &[usize] {
117 &self.string
118 }
119
120 /// Decode the compressed string C and return the decoded string,
121 /// the active length of C, and the pointer count.
122 /// Returns None if decoding fails (invalid pointer, circular reference, etc.).
123 fn decode(&self, config: &[usize]) -> Option<(Vec<usize>, usize, usize)> {
124 let n = self.string.len();
125 let k = self.alphabet_size;
126 let eos = k; // end-of-string marker
127
128 // Find active length: prefix before first end-of-string marker
129 let active_len = config.iter().position(|&v| v == eos).unwrap_or(n);
130
131 // Verify contiguous: all after first EOS must be EOS or padding
132 for &v in &config[active_len..] {
133 if v != eos {
134 return None;
135 }
136 }
137
138 // Decode left-to-right. A pointer at compressed position c_idx
139 // referencing C[j] copies from the decoded output that existed
140 // before this pointer (no overlapping/runaway copy).
141 let mut decoded = Vec::new();
142 let mut pointer_count: usize = 0;
143
144 for &v in &config[..active_len] {
145 if v < k {
146 // Literal symbol
147 decoded.push(v);
148 } else if v > k {
149 // Pointer: references C[ref_pos] in the compressed string
150 let ref_pos = v - k - 1;
151 if ref_pos >= decoded.len() {
152 return None; // pointer references undecoded position
153 }
154 // Greedy longest match from decoded[ref_pos..copy_start]
155 // (only pre-existing decoded content, no overlapping copy)
156 let copy_start = decoded.len();
157 let mut matched = 0;
158 while copy_start + matched < n {
159 let src_idx = ref_pos + matched;
160 if src_idx >= copy_start {
161 break; // cannot read beyond pre-existing content
162 }
163 if decoded[src_idx] != self.string[copy_start + matched] {
164 break;
165 }
166 decoded.push(decoded[src_idx]);
167 matched += 1;
168 }
169 if matched == 0 {
170 return None; // pointer must copy at least one symbol
171 }
172 pointer_count += 1;
173 } else {
174 // v == eos, but we filtered those out above
175 return None;
176 }
177 }
178
179 Some((decoded, active_len, pointer_count))
180 }
181}
182
183impl Problem for MinimumInternalMacroDataCompression {
184 const NAME: &'static str = "MinimumInternalMacroDataCompression";
185 type Value = Min<usize>;
186
187 fn variant() -> Vec<(&'static str, &'static str)> {
188 crate::variant_params![]
189 }
190
191 fn dims(&self) -> Vec<usize> {
192 let n = self.string.len();
193 let domain = self.alphabet_size + n + 1; // literals + EOS + pointers
194 vec![domain; n]
195 }
196
197 fn evaluate(&self, config: &[usize]) -> Min<usize> {
198 let n = self.string.len();
199 if config.len() != n {
200 return Min(None);
201 }
202
203 // Handle empty string
204 if n == 0 {
205 return Min(Some(0));
206 }
207
208 match self.decode(config) {
209 Some((decoded, active_len, pointer_count)) => {
210 if decoded != self.string {
211 Min(None)
212 } else {
213 let cost = active_len + (self.pointer_cost - 1) * pointer_count;
214 Min(Some(cost))
215 }
216 }
217 None => Min(None),
218 }
219 }
220}
221
222crate::declare_variants! {
223 default MinimumInternalMacroDataCompression => "(alphabet_size + string_len + 1) ^ string_len",
224}
225
226#[cfg(feature = "example-db")]
227pub(crate) fn canonical_model_example_specs() -> Vec<crate::example_db::specs::ModelExampleSpec> {
228 // Issue #442 example: alphabet {a,b,c} (3), s="abcabcabc" (9), h=2
229 // Optimal: C = [a, b, c, ptr(0), ptr(0), EOS, EOS, EOS, EOS]
230 // active_len = 5, pointers = 2
231 // cost = 5 + (2-1)*2 = 7
232 //
233 // Config encoding:
234 // alphabet_size = 3, string_len = 9, domain = 3+9+1 = 13
235 // Literals: 0=a, 1=b, 2=c
236 // EOS: 3
237 // Pointers: 4=ptr(C[0]), 5=ptr(C[1]), ...
238 let s: Vec<usize> = vec![0, 1, 2, 0, 1, 2, 0, 1, 2];
239 let optimal_config = vec![
240 0, 1, 2, // literals a, b, c
241 4, // ptr(C[0]) -> greedy "abc"
242 4, // ptr(C[0]) -> greedy "abc"
243 3, 3, 3, 3, // EOS padding
244 ];
245 vec![crate::example_db::specs::ModelExampleSpec {
246 id: "minimum_internal_macro_data_compression",
247 instance: Box::new(MinimumInternalMacroDataCompression::new(3, s, 2)),
248 optimal_config,
249 optimal_value: serde_json::json!(7),
250 }]
251}
252
253#[cfg(test)]
254#[path = "../../unit_tests/models/misc/minimum_internal_macro_data_compression.rs"]
255mod tests;