problemreductions/models/misc/minimum_external_macro_data_compression.rs
1//! Minimum External Macro Data Compression problem implementation.
2//!
3//! Given an alphabet Sigma, a string s in Sigma*, and a pointer cost h,
4//! find a dictionary string D and compressed string C minimizing the total
5//! cost |D| + |C| + (h-1) * (number of pointer occurrences in D and C),
6//! such that s can be reconstructed from C by replacing pointers with their
7//! referenced substrings of D.
8//!
9//! The configuration uses 2*|s| slots: |s| slots for D (dictionary) and |s|
10//! slots for C (compressed string). D-slots use alphabet symbols or empty.
11//! C-slots use alphabet symbols, pointers into D (start, len), or empty.
12//! D is restricted to be pointer-free (pure alphabet string).
13//!
14//! This problem is NP-hard (Storer, 1977; Storer & Szymanski, 1978).
15//! Reference: Garey & Johnson A4 SR22.
16
17use crate::registry::{FieldInfo, ProblemSchemaEntry};
18use crate::traits::Problem;
19use crate::types::Min;
20use serde::{Deserialize, Serialize};
21
22inventory::submit! {
23 ProblemSchemaEntry {
24 name: "MinimumExternalMacroDataCompression",
25 display_name: "Minimum External Macro Data Compression",
26 aliases: &[],
27 dimensions: &[],
28 module_path: module_path!(),
29 description: "Find minimum-cost compression using an external dictionary and compressed string with pointers",
30 fields: &[
31 FieldInfo { name: "alphabet_size", type_name: "usize", description: "Size of the alphabet (symbols indexed 0..alphabet_size)" },
32 FieldInfo { name: "string", type_name: "Vec<usize>", description: "Source string as symbol indices" },
33 FieldInfo { name: "pointer_cost", type_name: "usize", description: "Pointer cost h (each pointer contributes h to the cost)" },
34 ],
35 }
36}
37
38/// Minimum External Macro Data Compression problem.
39///
40/// Given an alphabet of size `k`, a string `s` over `{0, ..., k-1}`, and
41/// a pointer cost `h`, find dictionary string D and compressed string C
42/// that minimize cost = |D| + |C| + (h-1) * (pointer count in C).
43///
44/// # Representation
45///
46/// The configuration is a vector of `2 * string_length` entries:
47/// - First `string_length` entries are D-slots: each is a symbol index
48/// in `{0, ..., alphabet_size-1}` or `alphabet_size` (empty/unused).
49/// - Next `string_length` entries are C-slots: each is:
50/// - A symbol index in `{0, ..., alphabet_size-1}` (literal)
51/// - `alphabet_size` (empty/unused)
52/// - A value in `{alphabet_size+1, ..., alphabet_size + |s|*(|s|+1)/2}`
53/// encoding a pointer (start, len) into D.
54///
55/// D is the prefix of non-empty D-slots. C is the prefix of non-empty C-slots.
56/// The cost is |D| + |C| + (h-1) * (number of pointer symbols in C).
57///
58/// # Example
59///
60/// ```
61/// use problemreductions::models::misc::MinimumExternalMacroDataCompression;
62/// use problemreductions::{Problem, Solver, BruteForce};
63///
64/// // Alphabet {a, b}, string "abab", pointer cost h=2
65/// let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1, 0, 1], 2);
66/// let solver = BruteForce::new();
67/// let solution = solver.find_witness(&problem);
68/// assert!(solution.is_some());
69/// ```
70#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct MinimumExternalMacroDataCompression {
72 alphabet_size: usize,
73 string: Vec<usize>,
74 pointer_cost: usize,
75}
76
77impl MinimumExternalMacroDataCompression {
78 /// Create a new MinimumExternalMacroDataCompression instance.
79 ///
80 /// # Panics
81 ///
82 /// Panics if `alphabet_size` is 0 and the string is non-empty, or if
83 /// any symbol in the string is >= `alphabet_size`, or if `pointer_cost` is 0.
84 pub fn new(alphabet_size: usize, string: Vec<usize>, pointer_cost: usize) -> Self {
85 assert!(
86 alphabet_size > 0 || string.is_empty(),
87 "alphabet_size must be > 0 when the string is non-empty"
88 );
89 assert!(
90 string
91 .iter()
92 .all(|&s| s < alphabet_size || alphabet_size == 0),
93 "all symbols must be less than alphabet_size"
94 );
95 assert!(pointer_cost > 0, "pointer_cost must be positive");
96 Self {
97 alphabet_size,
98 string,
99 pointer_cost,
100 }
101 }
102
103 /// Returns the length of the source string.
104 pub fn string_length(&self) -> usize {
105 self.string.len()
106 }
107
108 /// Returns the alphabet size.
109 pub fn alphabet_size(&self) -> usize {
110 self.alphabet_size
111 }
112
113 /// Returns the pointer cost h.
114 pub fn pointer_cost(&self) -> usize {
115 self.pointer_cost
116 }
117
118 /// Returns the source string.
119 pub fn string(&self) -> &[usize] {
120 &self.string
121 }
122
123 /// Returns the number of valid pointers into D (|s|*(|s|+1)/2).
124 fn num_pointers(&self) -> usize {
125 let n = self.string.len();
126 n * (n + 1) / 2
127 }
128
129 /// Returns the C-slot domain size: alphabet_size + 1 (empty) + num_pointers.
130 fn c_domain_size(&self) -> usize {
131 self.alphabet_size + 1 + self.num_pointers()
132 }
133
134 /// Decode a pointer index (offset from alphabet_size+1) into (start, len)
135 /// in the dictionary. Pointers are enumerated as:
136 /// index 0 -> (0, 1), 1 -> (0, 2), ..., n-1 -> (0, n),
137 /// n -> (1, 1), n+1 -> (1, 2), ..., etc.
138 fn decode_pointer(&self, ptr_idx: usize) -> Option<(usize, usize)> {
139 let n = self.string.len();
140 // Enumerate (start, len) pairs where 0 <= start < n, 1 <= len <= n - start
141 let mut idx = 0;
142 for start in 0..n {
143 let max_len = n - start;
144 if ptr_idx < idx + max_len {
145 let len = ptr_idx - idx + 1;
146 return Some((start, len));
147 }
148 idx += max_len;
149 }
150 None
151 }
152}
153
154impl Problem for MinimumExternalMacroDataCompression {
155 const NAME: &'static str = "MinimumExternalMacroDataCompression";
156 type Value = Min<usize>;
157
158 fn variant() -> Vec<(&'static str, &'static str)> {
159 crate::variant_params![]
160 }
161
162 fn dims(&self) -> Vec<usize> {
163 let n = self.string.len();
164 let d_domain = self.alphabet_size + 1; // symbols + empty
165 let c_domain = self.c_domain_size(); // symbols + empty + pointers
166 let mut dims = vec![d_domain; n]; // D-slots
167 dims.extend(vec![c_domain; n]); // C-slots
168 dims
169 }
170
171 fn evaluate(&self, config: &[usize]) -> Min<usize> {
172 let n = self.string.len();
173 if config.len() != 2 * n {
174 return Min(None);
175 }
176
177 // Handle empty string case
178 if n == 0 {
179 return Min(Some(0));
180 }
181
182 let empty_d = self.alphabet_size; // empty marker for D-slots
183 let empty_c = self.alphabet_size; // empty marker for C-slots
184
185 // Decode D: prefix of non-empty D-slots
186 let d_slots = &config[..n];
187 let d_len = d_slots.iter().position(|&v| v == empty_d).unwrap_or(n);
188
189 // Verify contiguous: all after first empty must be empty
190 for &v in &d_slots[d_len..] {
191 if v != empty_d {
192 return Min(None);
193 }
194 }
195
196 // Verify D symbols are valid alphabet symbols
197 let d_str: Vec<usize> = d_slots[..d_len].to_vec();
198 if d_str.iter().any(|&v| v >= self.alphabet_size) {
199 return Min(None);
200 }
201
202 // Decode C: prefix of non-empty C-slots
203 let c_slots = &config[n..];
204 let c_len = c_slots.iter().position(|&v| v == empty_c).unwrap_or(n);
205
206 // Verify contiguous: all after first empty must be empty
207 for &v in &c_slots[c_len..] {
208 if v != empty_c {
209 return Min(None);
210 }
211 }
212
213 // Decode C into a sequence of symbols, counting pointers
214 let mut decoded = Vec::new();
215 let mut pointer_count: usize = 0;
216
217 for &v in &c_slots[..c_len] {
218 if v < self.alphabet_size {
219 // Literal symbol
220 decoded.push(v);
221 } else if v > self.alphabet_size {
222 // Pointer into D
223 let ptr_idx = v - (self.alphabet_size + 1);
224 if let Some((start, len)) = self.decode_pointer(ptr_idx) {
225 // Pointer must reference valid portion of D
226 if start + len > d_len {
227 return Min(None);
228 }
229 decoded.extend_from_slice(&d_str[start..start + len]);
230 pointer_count += 1;
231 } else {
232 return Min(None);
233 }
234 } else {
235 // v == empty_c, but we already filtered those out
236 return Min(None);
237 }
238 }
239
240 // Check decoded string matches the source string
241 if decoded != self.string {
242 return Min(None);
243 }
244
245 // Compute cost: |D| + |C| + (h-1) * pointer_count
246 let cost = d_len + c_len + (self.pointer_cost - 1) * pointer_count;
247 Min(Some(cost))
248 }
249}
250
251crate::declare_variants! {
252 default MinimumExternalMacroDataCompression => "(alphabet_size + 1) ^ string_length * (alphabet_size + 1 + string_length * (string_length + 1) / 2) ^ string_length",
253}
254
255#[cfg(feature = "example-db")]
256pub(crate) fn canonical_model_example_specs() -> Vec<crate::example_db::specs::ModelExampleSpec> {
257 // Issue #441 example: alphabet {a,b,c,d,e,f} (6), s="abcdefabcdefabcdef" (18), h=2.
258 // Optimal: D="abcdef"(6), C = ptr(0,6) ptr(0,6) ptr(0,6), cost = 6+3+(2-1)*3 = 12.
259 // Solved via ILP reduction (brute force infeasible at this size).
260 //
261 // Config encoding (2*18 = 36 slots):
262 // D-slots: [0,1,2,3,4,5, 6,6,...,6] (6 symbols + 12 empty, empty=alphabet_size=6)
263 // C-slots: [ptr(0,6), ptr(0,6), ptr(0,6), 6,6,...,6] (3 pointers + 15 empty)
264 // ptr(0,6) index: start=0, len=6 → index 5 → encoded as 6+1+5 = 12
265 let s: Vec<usize> = (0..6).cycle().take(18).collect();
266 let mut optimal_config = vec![0, 1, 2, 3, 4, 5];
267 optimal_config.extend(vec![6; 12]); // empty D-slots
268 optimal_config.extend(vec![12, 12, 12]); // 3 pointers to D[0..6]
269 optimal_config.extend(vec![6; 15]); // empty C-slots
270 vec![crate::example_db::specs::ModelExampleSpec {
271 id: "minimum_external_macro_data_compression",
272 instance: Box::new(MinimumExternalMacroDataCompression::new(6, s, 2)),
273 optimal_config,
274 optimal_value: serde_json::json!(12),
275 }]
276}
277
278#[cfg(test)]
279#[path = "../../unit_tests/models/misc/minimum_external_macro_data_compression.rs"]
280mod tests;