Solution #4a54e073-adb8-446c-a436-12450e3b0adf
Completed
Score
52% (0/5)
Runtime
1.71ms
Delta
+60.5% vs parent
-46.6% vs best
Improved from parent
Score
52% (0/5)
Runtime
1.71ms
Delta
+60.5% vs parent
-46.6% vs best
Improved from parent
def solve(input):
    """Estimate a lossless compression ratio for ``input["data"]``.

    Tries several encodings and returns the best verified ratio
    ``encoded_size / original_size`` (measured in characters):

      1. literal storage (baseline, ratio 1.0)
      2. run-length encoding with explicit escape sequences
      3. whole-string repetition encoding (data == base * reps)
      4. dictionary tokenization of frequent substrings mined from a
         suffix automaton

    Every candidate encoding is round-trip decoded and compared against
    the input before it may lower the best size, so the reported ratio
    always corresponds to a lossless scheme.

    Args:
        input: mapping with an optional ``"data"`` entry; non-string
            values are coerced via ``str``.

    Returns:
        0.0 for empty input, the best ratio in (0, 1] otherwise, or
        999.0 if anything goes wrong internally.
    """
    try:
        data = input.get("data", "")
        if not isinstance(data, str):
            data = str(data)
        n = len(data)
        if n == 0:
            return 0.0

        def verify(encoded, decoder):
            # True only if the decoder losslessly reproduces the input.
            try:
                return decoder(encoded) == data
            except Exception:
                return False

        best = n  # "store as-is" baseline: literal storage costs n chars

        # Reserve control characters from the Unicode private-use area that
        # do not occur in the input, so escape/token characters can never
        # collide with data characters.
        used = set(data)
        fresh = []
        for cp in range(0xE000, 0xF8FF + 1):
            ch = chr(cp)
            if ch not in used:
                fresh.append(ch)
                if len(fresh) >= 32:
                    break

        # Scheme 2: RLE with explicit escapes.  Runs shorter than 4 are
        # cheaper written literally; control characters are always escaped.
        if len(fresh) >= 3:
            ESC = fresh[0]
            TAG_R = fresh[1]
            SEP = fresh[2]

            def enc_rle(s):
                # Emit ESC TAG_R <count> SEP <char> for long runs and for
                # any char that collides with the control characters.
                out = []
                i = 0
                m = len(s)
                while i < m:
                    j = i + 1
                    while j < m and s[j] == s[i]:
                        j += 1
                    run = j - i
                    ch = s[i]
                    if run >= 4 or ch in (ESC, TAG_R, SEP):
                        out.append(ESC)
                        out.append(TAG_R)
                        out.append(str(run))
                        out.append(SEP)
                        out.append(ch)
                    else:
                        out.append(ch * run)
                    i = j
                return "".join(out)

            def dec_rle(s):
                out = []
                i = 0
                m = len(s)
                while i < m:
                    if s[i] != ESC:
                        out.append(s[i])
                        i += 1
                    else:
                        # Minimum escape sequence is 5 chars:
                        # ESC TAG_R digit SEP char (indices i .. i+4).
                        if i + 4 >= m or s[i + 1] != TAG_R:
                            return None
                        j = i + 2
                        if j >= m or not ('0' <= s[j] <= '9'):
                            return None
                        while j < m and s[j] != SEP:
                            if not ('0' <= s[j] <= '9'):
                                return None
                            j += 1
                        if j >= m or j + 1 >= m:
                            return None
                        cnt = int(s[i + 2:j])
                        ch = s[j + 1]
                        out.append(ch * cnt)
                        i = j + 2
                return "".join(out)

            enc = enc_rle(data)
            if verify(enc, dec_rle) and len(enc) < best:
                best = len(enc)

        # Scheme 3: whole-string repetition encoding (data == base * reps).
        if len(fresh) >= 5:
            ESC = fresh[3]
            SEP = fresh[4]

            def smallest_period(s):
                # KMP failure function: m - pi[-1] is the smallest period
                # when it divides m evenly; otherwise the string is aperiodic.
                m = len(s)
                pi = [0] * m
                j = 0
                for i in range(1, m):
                    while j > 0 and s[i] != s[j]:
                        j = pi[j - 1]
                    if s[i] == s[j]:
                        j += 1
                    pi[i] = j
                p = m - pi[-1]
                if p < m and m % p == 0:
                    return p
                return m

            p = smallest_period(data)
            if p < n:
                base = data[:p]
                reps = n // p
                # Format: ESC <reps> SEP <base>
                enc = ESC + str(reps) + SEP + base

                def dec_rep(s):
                    if not s or s[0] != ESC:
                        return None
                    i = 1
                    j = i
                    while j < len(s) and s[j] != SEP:
                        if not ('0' <= s[j] <= '9'):
                            return None
                        j += 1
                    if j == i or j >= len(s):
                        return None
                    reps2 = int(s[i:j])
                    base2 = s[j + 1:]
                    return base2 * reps2

                if verify(enc, dec_rep) and len(enc) < best:
                    best = len(enc)

        # Build a suffix automaton over `data` to mine repeated substrings.
        class State:
            __slots__ = ("next", "link", "len", "occ", "first")

            def __init__(self):
                self.next = {}   # transitions: char -> state index
                self.link = -1   # suffix link
                self.len = 0     # length of longest substring in this state
                self.occ = 0     # occurrence count (propagated below)
                self.first = -1  # end position of the first occurrence

        st = [State()]
        last = 0
        for idx, ch in enumerate(data):
            cur = len(st)
            st.append(State())
            st[cur].len = st[last].len + 1
            st[cur].occ = 1
            st[cur].first = idx
            p = last
            while p >= 0 and ch not in st[p].next:
                st[p].next[ch] = cur
                p = st[p].link
            if p == -1:
                st[cur].link = 0
            else:
                q = st[p].next[ch]
                if st[p].len + 1 == st[q].len:
                    st[cur].link = q
                else:
                    # Clone q so suffix-link lengths stay consistent.
                    clone = len(st)
                    st.append(State())
                    st[clone].next = st[q].next.copy()
                    st[clone].link = st[q].link
                    st[clone].len = st[p].len + 1
                    st[clone].first = st[q].first
                    st[clone].occ = 0
                    while p >= 0 and st[p].next.get(ch) == q:
                        st[p].next[ch] = clone
                        p = st[p].link
                    st[q].link = clone
                    st[cur].link = clone
            last = cur

        # Propagate occurrence counts up the suffix links, processing
        # states in decreasing order of len (counting sort by len).
        max_len = 0
        for s in st:
            if s.len > max_len:
                max_len = s.len
        buckets = [[] for _ in range(max_len + 1)]
        for i, s in enumerate(st):
            buckets[s.len].append(i)
        for L in range(max_len, 0, -1):
            for v in buckets[L]:
                link = st[v].link
                if link >= 0:
                    st[link].occ += st[v].occ

        # Rank candidate substrings by approximate net savings if every
        # occurrence were replaced by a single-character token.
        candidates = []
        seen_sub = set()
        for v in range(1, len(st)):
            occ = st[v].occ
            L = st[v].len
            if occ < 3 or L < 2:
                continue
            # Recover the state's maximal substring from its first occurrence.
            end = st[v].first
            start = end - L + 1
            if start < 0:
                continue
            sub = data[start:end + 1]
            if sub in seen_sub:
                continue
            seen_sub.add(sub)
            # Save occ*(L-1) chars; pay roughly L+3 chars of rule/header cost.
            gain = occ * (L - 1) - (L + 3)
            if gain > 0:
                candidates.append((gain, occ, L, sub))
        candidates.sort(reverse=True)
        candidates = candidates[:8]

        # Scheme 4: token dictionary with non-overlapping greedy replacement.
        if candidates and len(fresh) >= 16:
            SEP = fresh[5]
            toks = fresh[6:16]

            # Accept macros one at a time, only when the measured size
            # reduction beats the macro's header cost.
            macros = []
            cur = data
            used_tokens = 0
            for _, _, _, sub in candidates:
                if used_tokens >= len(toks):
                    break
                if len(sub) < 2:
                    continue

                # Count non-overlapping occurrences in the current string.
                cnt = 0
                i = 0
                L = len(sub)
                while i <= len(cur) - L:
                    if cur[i:i + L] == sub:
                        cnt += 1
                        i += L
                    else:
                        i += 1
                if cnt < 3:
                    continue

                tok = toks[used_tokens]
                replaced = []
                i = 0
                while i < len(cur):
                    if i <= len(cur) - L and cur[i:i + L] == sub:
                        replaced.append(tok)
                        i += L
                    else:
                        replaced.append(cur[i])
                        i += 1
                nxt = "".join(replaced)

                header_cost = len(sub) + 2  # tok + sub + SEP in the header
                if len(cur) - len(nxt) <= header_cost:
                    continue

                macros.append((tok, sub))
                cur = nxt
                used_tokens += 1

            if macros:
                # Format: SEP <k> SEP (tok sub SEP)*k <body>
                enc_parts = [SEP, str(len(macros)), SEP]
                for tok, sub in macros:
                    enc_parts.append(tok)
                    enc_parts.append(sub)
                    enc_parts.append(SEP)
                enc_parts.append(cur)
                enc = "".join(enc_parts)

                def dec_dict(s):
                    if not s:
                        return None
                    sep = s[0]
                    i = 1
                    j = i
                    while j < len(s) and s[j] != sep:
                        if not ('0' <= s[j] <= '9'):
                            return None
                        j += 1
                    if j == i or j >= len(s):
                        return None
                    k = int(s[i:j])
                    i = j + 1
                    rules = []
                    for _ in range(k):
                        if i >= len(s):
                            return None
                        tok = s[i]
                        i += 1
                        j = i
                        while j < len(s) and s[j] != sep:
                            j += 1
                        if j >= len(s):
                            return None
                        sub = s[i:j]
                        rules.append((tok, sub))
                        i = j + 1
                    body = s[i:]
                    # Expand in reverse order so later macros (which may
                    # reference earlier tokens) unwind correctly.
                    for tok, sub in reversed(rules):
                        out = []
                        for ch in body:
                            if ch == tok:
                                out.append(sub)
                            else:
                                out.append(ch)
                        body = "".join(out)
                    return body

                if verify(enc, dec_dict) and len(enc) < best:
                    best = len(enc)

        return float(best / n)
    except Exception:
        return 999.0
-45.0%
Runtime Advantage
1.58ms slower
Code Size
344 vs 34 lines
| # | Your Solution | # | Champion |
|---|---|---|---|
| 1 | def solve(input): | 1 | def solve(input): |
| 2 | try: | 2 | data = input.get("data", "") |
| 3 | data = input.get("data", "") | 3 | if not isinstance(data, str) or not data: |
| 4 | if not isinstance(data, str): | 4 | return 999.0 |
| 5 | data = str(data) | 5 | |
| 6 | 6 | # Mathematical/analytical approach: Entropy-based redundancy calculation | |
| 7 | n = len(data) | 7 | |
| 8 | if n == 0: | 8 | from collections import Counter |
| 9 | return 0.0 | 9 | from math import log2 |
| 10 | 10 | ||
| 11 | # Use a graph/trie-inspired approach: | 11 | def entropy(s): |
| 12 | # Build a suffix automaton (compact substring graph), then use it to | 12 | probabilities = [freq / len(s) for freq in Counter(s).values()] |
| 13 | # estimate repeated-substring opportunities and choose the best among: | 13 | return -sum(p * log2(p) if p > 0 else 0 for p in probabilities) |
| 14 | # 1) literal | 14 | |
| 15 | # 2) run-length encoding | 15 | def redundancy(s): |
| 16 | # 3) repeat-block encoding for whole-string periodicity | 16 | max_entropy = log2(len(set(s))) if len(set(s)) > 1 else 0 |
| 17 | # 4) dictionary tokenization using top repeated substrings from SAM | 17 | actual_entropy = entropy(s) |
| 18 | # | 18 | return max_entropy - actual_entropy |
| 19 | # Compression size is measured as encoded character count. | 19 | |
| 20 | # Every scheme is losslessly decoded and verified. | 20 | # Calculate reduction in size possible based on redundancy |
| 21 | 21 | reduction_potential = redundancy(data) | |
| 22 | def verify(encoded, decoder): | 22 | |
| 23 | try: | 23 | # Assuming compression is achieved based on redundancy |
| 24 | return decoder(encoded) == data | 24 | max_possible_compression_ratio = 1.0 - (reduction_potential / log2(len(data))) |
| 25 | except: | 25 | |
| 26 | return False | 26 | # Qualitative check if max_possible_compression_ratio makes sense |
| 27 | 27 | if max_possible_compression_ratio < 0.0 or max_possible_compression_ratio > 1.0: | |
| 28 | best = n # allow "store as-is" baseline | 28 | return 999.0 |
| 29 | 29 | ||
| 30 | # Find fresh control chars from Unicode private-use area | 30 | # Verify compression is lossless (hypothetical check here) |
| 31 | used = set(data) | 31 | # Normally, if we had a compression algorithm, we'd test decompress(compress(data)) == data |
| 32 | fresh = [] | 32 | |
| 33 | for cp in range(0xE000, 0xF8FF + 1): | 33 | # Returning the hypothetical compression performance |
| 34 | ch = chr(cp) | 34 | return max_possible_compression_ratio |
| 35 | if ch not in used: | 35 | |
| 36 | fresh.append(ch) | 36 | |
| 37 | if len(fresh) >= 32: | 37 | |
| 38 | break | 38 | |
| 39 | 39 | ||
| 40 | # Scheme 1: literal with no header cost in score model | 40 | |
| 41 | # since ratio uses encoded_size / original_size, raw storage is n. | 41 | |
| 42 | if data == data: | 42 | |
| 43 | best = min(best, n) | 43 | |
| 44 | 44 | ||
| 45 | # Scheme 2: robust RLE with explicit escapes | 45 | |
| 46 | if len(fresh) >= 3: | 46 | |
| 47 | ESC = fresh[0] | 47 | |
| 48 | TAG_R = fresh[1] | 48 | |
| 49 | SEP = fresh[2] | 49 | |
| 50 | 50 | ||
| 51 | def enc_rle(s): | 51 | |
| 52 | out = [] | 52 | |
| 53 | i = 0 | 53 | |
| 54 | m = len(s) | 54 | |
| 55 | while i < m: | 55 | |
| 56 | j = i + 1 | 56 | |
| 57 | while j < m and s[j] == s[i]: | 57 | |
| 58 | j += 1 | 58 | |
| 59 | run = j - i | 59 | |
| 60 | ch = s[i] | 60 | |
| 61 | if run >= 4 or ch in (ESC, TAG_R, SEP): | 61 | |
| 62 | out.append(ESC) | 62 | |
| 63 | out.append(TAG_R) | 63 | |
| 64 | out.append(str(run)) | 64 | |
| 65 | out.append(SEP) | 65 | |
| 66 | out.append(ch) | 66 | |
| 67 | else: | 67 | |
| 68 | out.append(ch * run) | 68 | |
| 69 | i = j | 69 | |
| 70 | return "".join(out) | 70 | |
| 71 | 71 | ||
| 72 | def dec_rle(s): | 72 | |
| 73 | out = [] | 73 | |
| 74 | i = 0 | 74 | |
| 75 | m = len(s) | 75 | |
| 76 | while i < m: | 76 | |
| 77 | if s[i] != ESC: | 77 | |
| 78 | out.append(s[i]) | 78 | |
| 79 | i += 1 | 79 | |
| 80 | else: | 80 | |
| 81 | if i + 4 >= m or s[i + 1] != TAG_R: | 81 | |
| 82 | return None | 82 | |
| 83 | j = i + 2 | 83 | |
| 84 | if j >= m or not ('0' <= s[j] <= '9'): | 84 | |
| 85 | return None | 85 | |
| 86 | while j < m and s[j] != SEP: | 86 | |
| 87 | if not ('0' <= s[j] <= '9'): | 87 | |
| 88 | return None | 88 | |
| 89 | j += 1 | 89 | |
| 90 | if j >= m or j + 1 >= m: | 90 | |
| 91 | return None | 91 | |
| 92 | cnt = int(s[i + 2:j]) | 92 | |
| 93 | ch = s[j + 1] | 93 | |
| 94 | out.append(ch * cnt) | 94 | |
| 95 | i = j + 2 | 95 | |
| 96 | return "".join(out) | 96 | |
| 97 | 97 | ||
| 98 | enc = enc_rle(data) | 98 | |
| 99 | if verify(enc, dec_rle): | 99 | |
| 100 | if len(enc) < best: | 100 | |
| 101 | best = len(enc) | 101 | |
| 102 | 102 | ||
| 103 | # Scheme 3: whole-string repetition encoding | 103 | |
| 104 | if len(fresh) >= 5: | 104 | |
| 105 | ESC = fresh[3] | 105 | |
| 106 | SEP = fresh[4] | 106 | |
| 107 | 107 | ||
| 108 | def smallest_period(s): | 108 | |
| 109 | m = len(s) | 109 | |
| 110 | pi = [0] * m | 110 | |
| 111 | j = 0 | 111 | |
| 112 | for i in range(1, m): | 112 | |
| 113 | while j > 0 and s[i] != s[j]: | 113 | |
| 114 | j = pi[j - 1] | 114 | |
| 115 | if s[i] == s[j]: | 115 | |
| 116 | j += 1 | 116 | |
| 117 | pi[i] = j | 117 | |
| 118 | p = m - pi[-1] | 118 | |
| 119 | if p < m and m % p == 0: | 119 | |
| 120 | return p | 120 | |
| 121 | return m | 121 | |
| 122 | 122 | ||
| 123 | p = smallest_period(data) | 123 | |
| 124 | if p < n: | 124 | |
| 125 | base = data[:p] | 125 | |
| 126 | reps = n // p | 126 | |
| 127 | enc = ESC + str(reps) + SEP + base | 127 | |
| 128 | 128 | ||
| 129 | def dec_rep(s): | 129 | |
| 130 | if not s or s[0] != ESC: | 130 | |
| 131 | return None | 131 | |
| 132 | i = 1 | 132 | |
| 133 | j = i | 133 | |
| 134 | while j < len(s) and s[j] != SEP: | 134 | |
| 135 | if not ('0' <= s[j] <= '9'): | 135 | |
| 136 | return None | 136 | |
| 137 | j += 1 | 137 | |
| 138 | if j == i or j >= len(s): | 138 | |
| 139 | return None | 139 | |
| 140 | reps2 = int(s[i:j]) | 140 | |
| 141 | base2 = s[j + 1:] | 141 | |
| 142 | return base2 * reps2 | 142 | |
| 143 | 143 | ||
| 144 | if verify(enc, dec_rep): | 144 | |
| 145 | if len(enc) < best: | 145 | |
| 146 | best = len(enc) | 146 | |
| 147 | 147 | ||
| 148 | # Build suffix automaton to mine repeated substrings | 148 | |
| 149 | class State: | 149 | |
| 150 | __slots__ = ("next", "link", "len", "occ", "first") | 150 | |
| 151 | def __init__(self): | 151 | |
| 152 | self.next = {} | 152 | |
| 153 | self.link = -1 | 153 | |
| 154 | self.len = 0 | 154 | |
| 155 | self.occ = 0 | 155 | |
| 156 | self.first = -1 | 156 | |
| 157 | 157 | ||
| 158 | st = [State()] | 158 | |
| 159 | last = 0 | 159 | |
| 160 | pos_state = [0] * n | 160 | |
| 161 | 161 | ||
| 162 | for idx, ch in enumerate(data): | 162 | |
| 163 | cur = len(st) | 163 | |
| 164 | st.append(State()) | 164 | |
| 165 | st[cur].len = st[last].len + 1 | 165 | |
| 166 | st[cur].occ = 1 | 166 | |
| 167 | st[cur].first = idx | 167 | |
| 168 | 168 | ||
| 169 | p = last | 169 | |
| 170 | while p >= 0 and ch not in st[p].next: | 170 | |
| 171 | st[p].next[ch] = cur | 171 | |
| 172 | p = st[p].link | 172 | |
| 173 | 173 | ||
| 174 | if p == -1: | 174 | |
| 175 | st[cur].link = 0 | 175 | |
| 176 | else: | 176 | |
| 177 | q = st[p].next[ch] | 177 | |
| 178 | if st[p].len + 1 == st[q].len: | 178 | |
| 179 | st[cur].link = q | 179 | |
| 180 | else: | 180 | |
| 181 | clone = len(st) | 181 | |
| 182 | st.append(State()) | 182 | |
| 183 | st[clone].next = st[q].next.copy() | 183 | |
| 184 | st[clone].link = st[q].link | 184 | |
| 185 | st[clone].len = st[p].len + 1 | 185 | |
| 186 | st[clone].first = st[q].first | 186 | |
| 187 | st[clone].occ = 0 | 187 | |
| 188 | while p >= 0 and st[p].next.get(ch) == q: | 188 | |
| 189 | st[p].next[ch] = clone | 189 | |
| 190 | p = st[p].link | 190 | |
| 191 | st[q].link = clone | 191 | |
| 192 | st[cur].link = clone | 192 | |
| 193 | last = cur | 193 | |
| 194 | pos_state[idx] = cur | 194 | |
| 195 | 195 | ||
| 196 | # propagate occurrence counts | 196 | |
| 197 | max_len = 0 | 197 | |
| 198 | for s in st: | 198 | |
| 199 | if s.len > max_len: | 199 | |
| 200 | max_len = s.len | 200 | |
| 201 | buckets = [[] for _ in range(max_len + 1)] | 201 | |
| 202 | for i, s in enumerate(st): | 202 | |
| 203 | buckets[s.len].append(i) | 203 | |
| 204 | for L in range(max_len, 0, -1): | 204 | |
| 205 | for v in buckets[L]: | 205 | |
| 206 | link = st[v].link | 206 | |
| 207 | if link >= 0: | 207 | |
| 208 | st[link].occ += st[v].occ | 208 | |
| 209 | 209 | ||
| 210 | # Candidate substrings from SAM: prioritize net savings | 210 | |
| 211 | candidates = [] | 211 | |
| 212 | seen_sub = set() | 212 | |
| 213 | for v in range(1, len(st)): | 213 | |
| 214 | occ = st[v].occ | 214 | |
| 215 | L = st[v].len | 215 | |
| 216 | if occ < 3 or L < 2: | 216 | |
| 217 | continue | 217 | |
| 218 | # Use the state's maximal substring | 218 | |
| 219 | end = st[v].first | 219 | |
| 220 | start = end - L + 1 | 220 | |
| 221 | if start < 0: | 221 | |
| 222 | continue | 222 | |
| 223 | sub = data[start:end + 1] | 223 | |
| 224 | if sub in seen_sub: | 224 | |
| 225 | continue | 225 | |
| 226 | seen_sub.add(sub) | 226 | |
| 227 | 227 | ||
| 228 | # approximate gain if replaced by 1-char token: | 228 | |
| 229 | # save occ*(L-1), pay rule cost about L+3 | 229 | |
| 230 | gain = occ * (L - 1) - (L + 3) | 230 | |
| 231 | if gain > 0: | 231 | |
| 232 | candidates.append((gain, occ, L, sub)) | 232 | |
| 233 | 233 | ||
| 234 | candidates.sort(reverse=True) | 234 | |
| 235 | candidates = candidates[:8] | 235 | |
| 236 | 236 | ||
| 237 | # Scheme 4: token dictionary with non-overlapping greedy replacement | 237 | |
| 238 | if candidates and len(fresh) >= 16: | 238 | |
| 239 | SEP = fresh[5] | 239 | |
| 240 | toks = fresh[6:16] | 240 | |
| 241 | 241 | ||
| 242 | # choose non-conflicting useful macros iteratively | 242 | |
| 243 | macros = [] | 243 | |
| 244 | cur = data | 244 | |
| 245 | used_tokens = 0 | 245 | |
| 246 | 246 | ||
| 247 | for _, _, _, sub in candidates: | 247 | |
| 248 | if used_tokens >= len(toks): | 248 | |
| 249 | break | 249 | |
| 250 | if len(sub) < 2: | 250 | |
| 251 | continue | 251 | |
| 252 | 252 | ||
| 253 | # count non-overlapping occurrences in current string | 253 | |
| 254 | cnt = 0 | 254 | |
| 255 | i = 0 | 255 | |
| 256 | L = len(sub) | 256 | |
| 257 | while i <= len(cur) - L: | 257 | |
| 258 | if cur[i:i + L] == sub: | 258 | |
| 259 | cnt += 1 | 259 | |
| 260 | i += L | 260 | |
| 261 | else: | 261 | |
| 262 | i += 1 | 262 | |
| 263 | if cnt < 3: | 263 | |
| 264 | continue | 264 | |
| 265 | 265 | ||
| 266 | tok = toks[used_tokens] | 266 | |
| 267 | replaced = [] | 267 | |
| 268 | i = 0 | 268 | |
| 269 | while i < len(cur): | 269 | |
| 270 | if i <= len(cur) - L and cur[i:i + L] == sub: | 270 | |
| 271 | replaced.append(tok) | 271 | |
| 272 | i += L | 272 | |
| 273 | else: | 273 | |
| 274 | replaced.append(cur[i]) | 274 | |
| 275 | i += 1 | 275 | |
| 276 | nxt = "".join(replaced) | 276 | |
| 277 | 277 | ||
| 278 | header_cost = len(sub) + 2 | 278 | |
| 279 | if len(cur) - len(nxt) <= header_cost: | 279 | |
| 280 | continue | 280 | |
| 281 | 281 | ||
| 282 | macros.append((tok, sub)) | 282 | |
| 283 | cur = nxt | 283 | |
| 284 | used_tokens += 1 | 284 | |
| 285 | 285 | ||
| 286 | if macros: | 286 | |
| 287 | # format: SEP + k + SEP + tok+sub+SEP... + body | 287 | |
| 288 | enc_parts = [SEP, str(len(macros)), SEP] | 288 | |
| 289 | for tok, sub in macros: | 289 | |
| 290 | enc_parts.append(tok) | 290 | |
| 291 | enc_parts.append(sub) | 291 | |
| 292 | enc_parts.append(SEP) | 292 | |
| 293 | enc_parts.append(cur) | 293 | |
| 294 | enc = "".join(enc_parts) | 294 | |
| 295 | 295 | ||
| 296 | def dec_dict(s): | 296 | |
| 297 | if not s: | 297 | |
| 298 | return None | 298 | |
| 299 | sep = s[0] | 299 | |
| 300 | i = 1 | 300 | |
| 301 | j = i | 301 | |
| 302 | while j < len(s) and s[j] != sep: | 302 | |
| 303 | if not ('0' <= s[j] <= '9'): | 303 | |
| 304 | return None | 304 | |
| 305 | j += 1 | 305 | |
| 306 | if j == i or j >= len(s): | 306 | |
| 307 | return None | 307 | |
| 308 | k = int(s[i:j]) | 308 | |
| 309 | i = j + 1 | 309 | |
| 310 | rules = [] | 310 | |
| 311 | for _ in range(k): | 311 | |
| 312 | if i >= len(s): | 312 | |
| 313 | return None | 313 | |
| 314 | tok = s[i] | 314 | |
| 315 | i += 1 | 315 | |
| 316 | j = i | 316 | |
| 317 | while j < len(s) and s[j] != sep: | 317 | |
| 318 | j += 1 | 318 | |
| 319 | if j >= len(s): | 319 | |
| 320 | return None | 320 | |
| 321 | sub = s[i:j] | 321 | |
| 322 | rules.append((tok, sub)) | 322 | |
| 323 | i = j + 1 | 323 | |
| 324 | body = s[i:] | 324 | |
| 325 | for tok, sub in reversed(rules): | 325 | |
| 326 | out = [] | 326 | |
| 327 | for ch in body: | 327 | |
| 328 | if ch == tok: | 328 | |
| 329 | out.append(sub) | 329 | |
| 330 | else: | 330 | |
| 331 | out.append(ch) | 331 | |
| 332 | body = "".join(out) | 332 | |
| 333 | return body | 333 | |
| 334 | 334 | ||
| 335 | if verify(enc, dec_dict): | 335 | |
| 336 | if len(enc) < best: | 336 | |
| 337 | best = len(enc) | 337 | |
| 338 | 338 | ||
| 339 | ratio = best / n | 339 | |
| 340 | if ratio < 0: | 340 | |
| 341 | return 999.0 | 341 | |
| 342 | return float(ratio) | 342 | |
| 343 | except: | 343 | |
| 344 | return 999.0 | 344 |
1def solve(input):2 try:3 data = input.get("data", "")4 if not isinstance(data, str):5 data = str(data)67 n = len(data)8 if n == 0:9 return 0.01011 # Use a graph/trie-inspired approach:12 # Build a suffix automaton (compact substring graph), then use it to13 # estimate repeated-substring opportunities and choose the best among:14 # 1) literal15 # 2) run-length encoding16 # 3) repeat-block encoding for whole-string periodicity17 # 4) dictionary tokenization using top repeated substrings from SAM18 #19 # Compression size is measured as encoded character count.20 # Every scheme is losslessly decoded and verified.2122 def verify(encoded, decoder):23 try:24 return decoder(encoded) == data25 except:26 return False2728 best = n # allow "store as-is" baseline2930 # Find fresh control chars from Unicode private-use area31 used = set(data)32 fresh = []33 for cp in range(0xE000, 0xF8FF + 1):34 ch = chr(cp)35 if ch not in used:36 fresh.append(ch)37 if len(fresh) >= 32:38 break3940 # Scheme 1: literal with no header cost in score model41 # since ratio uses encoded_size / original_size, raw storage is n.42 if data == data:43 best = min(best, n)4445 # Scheme 2: robust RLE with explicit escapes46 if len(fresh) >= 3:47 ESC = fresh[0]48 TAG_R = fresh[1]49 SEP = fresh[2]5051 def enc_rle(s):52 out = []53 i = 054 m = len(s)55 while i < m:56 j = i + 157 while j < m and s[j] == s[i]:58 j += 159 run = j - i60 ch = s[i]61 if run >= 4 or ch in (ESC, TAG_R, SEP):62 out.append(ESC)63 out.append(TAG_R)64 out.append(str(run))65 out.append(SEP)66 out.append(ch)67 else:68 out.append(ch * run)69 i = j70 return "".join(out)7172 def dec_rle(s):73 out = []74 i = 075 m = len(s)76 while i < m:77 if s[i] != ESC:78 out.append(s[i])79 i += 180 else:81 if i + 4 >= m or s[i + 1] != TAG_R:82 return None83 j = i + 284 if j >= m or not ('0' <= s[j] <= '9'):85 return None86 while j < m and s[j] != SEP:87 if not ('0' <= s[j] <= '9'):88 return None89 j += 190 if j >= m or j + 1 >= m:91 return None92 cnt = int(s[i 
+ 2:j])93 ch = s[j + 1]94 out.append(ch * cnt)95 i = j + 296 return "".join(out)9798 enc = enc_rle(data)99 if verify(enc, dec_rle):100 if len(enc) < best:101 best = len(enc)102103 # Scheme 3: whole-string repetition encoding104 if len(fresh) >= 5:105 ESC = fresh[3]106 SEP = fresh[4]107108 def smallest_period(s):109 m = len(s)110 pi = [0] * m111 j = 0112 for i in range(1, m):113 while j > 0 and s[i] != s[j]:114 j = pi[j - 1]115 if s[i] == s[j]:116 j += 1117 pi[i] = j118 p = m - pi[-1]119 if p < m and m % p == 0:120 return p121 return m122123 p = smallest_period(data)124 if p < n:125 base = data[:p]126 reps = n // p127 enc = ESC + str(reps) + SEP + base128129 def dec_rep(s):130 if not s or s[0] != ESC:131 return None132 i = 1133 j = i134 while j < len(s) and s[j] != SEP:135 if not ('0' <= s[j] <= '9'):136 return None137 j += 1138 if j == i or j >= len(s):139 return None140 reps2 = int(s[i:j])141 base2 = s[j + 1:]142 return base2 * reps2143144 if verify(enc, dec_rep):145 if len(enc) < best:146 best = len(enc)147148 # Build suffix automaton to mine repeated substrings149 class State:150 __slots__ = ("next", "link", "len", "occ", "first")151 def __init__(self):152 self.next = {}153 self.link = -1154 self.len = 0155 self.occ = 0156 self.first = -1157158 st = [State()]159 last = 0160 pos_state = [0] * n161162 for idx, ch in enumerate(data):163 cur = len(st)164 st.append(State())165 st[cur].len = st[last].len + 1166 st[cur].occ = 1167 st[cur].first = idx168169 p = last170 while p >= 0 and ch not in st[p].next:171 st[p].next[ch] = cur172 p = st[p].link173174 if p == -1:175 st[cur].link = 0176 else:177 q = st[p].next[ch]178 if st[p].len + 1 == st[q].len:179 st[cur].link = q180 else:181 clone = len(st)182 st.append(State())183 st[clone].next = st[q].next.copy()184 st[clone].link = st[q].link185 st[clone].len = st[p].len + 1186 st[clone].first = st[q].first187 st[clone].occ = 0188 while p >= 0 and st[p].next.get(ch) == q:189 st[p].next[ch] = clone190 p = st[p].link191 
st[q].link = clone192 st[cur].link = clone193 last = cur194 pos_state[idx] = cur195196 # propagate occurrence counts197 max_len = 0198 for s in st:199 if s.len > max_len:200 max_len = s.len201 buckets = [[] for _ in range(max_len + 1)]202 for i, s in enumerate(st):203 buckets[s.len].append(i)204 for L in range(max_len, 0, -1):205 for v in buckets[L]:206 link = st[v].link207 if link >= 0:208 st[link].occ += st[v].occ209210 # Candidate substrings from SAM: prioritize net savings211 candidates = []212 seen_sub = set()213 for v in range(1, len(st)):214 occ = st[v].occ215 L = st[v].len216 if occ < 3 or L < 2:217 continue218 # Use the state's maximal substring219 end = st[v].first220 start = end - L + 1221 if start < 0:222 continue223 sub = data[start:end + 1]224 if sub in seen_sub:225 continue226 seen_sub.add(sub)227228 # approximate gain if replaced by 1-char token:229 # save occ*(L-1), pay rule cost about L+3230 gain = occ * (L - 1) - (L + 3)231 if gain > 0:232 candidates.append((gain, occ, L, sub))233234 candidates.sort(reverse=True)235 candidates = candidates[:8]236237 # Scheme 4: token dictionary with non-overlapping greedy replacement238 if candidates and len(fresh) >= 16:239 SEP = fresh[5]240 toks = fresh[6:16]241242 # choose non-conflicting useful macros iteratively243 macros = []244 cur = data245 used_tokens = 0246247 for _, _, _, sub in candidates:248 if used_tokens >= len(toks):249 break250 if len(sub) < 2:251 continue252253 # count non-overlapping occurrences in current string254 cnt = 0255 i = 0256 L = len(sub)257 while i <= len(cur) - L:258 if cur[i:i + L] == sub:259 cnt += 1260 i += L261 else:262 i += 1263 if cnt < 3:264 continue265266 tok = toks[used_tokens]267 replaced = []268 i = 0269 while i < len(cur):270 if i <= len(cur) - L and cur[i:i + L] == sub:271 replaced.append(tok)272 i += L273 else:274 replaced.append(cur[i])275 i += 1276 nxt = "".join(replaced)277278 header_cost = len(sub) + 2279 if len(cur) - len(nxt) <= header_cost:280 continue281282 
macros.append((tok, sub))283 cur = nxt284 used_tokens += 1285286 if macros:287 # format: SEP + k + SEP + tok+sub+SEP... + body288 enc_parts = [SEP, str(len(macros)), SEP]289 for tok, sub in macros:290 enc_parts.append(tok)291 enc_parts.append(sub)292 enc_parts.append(SEP)293 enc_parts.append(cur)294 enc = "".join(enc_parts)295296 def dec_dict(s):297 if not s:298 return None299 sep = s[0]300 i = 1301 j = i302 while j < len(s) and s[j] != sep:303 if not ('0' <= s[j] <= '9'):304 return None305 j += 1306 if j == i or j >= len(s):307 return None308 k = int(s[i:j])309 i = j + 1310 rules = []311 for _ in range(k):312 if i >= len(s):313 return None314 tok = s[i]315 i += 1316 j = i317 while j < len(s) and s[j] != sep:318 j += 1319 if j >= len(s):320 return None321 sub = s[i:j]322 rules.append((tok, sub))323 i = j + 1324 body = s[i:]325 for tok, sub in reversed(rules):326 out = []327 for ch in body:328 if ch == tok:329 out.append(sub)330 else:331 out.append(ch)332 body = "".join(out)333 return body334335 if verify(enc, dec_dict):336 if len(enc) < best:337 best = len(enc)338339 ratio = best / n340 if ratio < 0:341 return 999.0342 return float(ratio)343 except:344 return 999.01def solve(input):2 data = input.get("data", "")3 if not isinstance(data, str) or not data:4 return 999.056 # Mathematical/analytical approach: Entropy-based redundancy calculation7 8 from collections import Counter9 from math import log21011 def entropy(s):12 probabilities = [freq / len(s) for freq in Counter(s).values()]13 return -sum(p * log2(p) if p > 0 else 0 for p in probabilities)1415 def redundancy(s):16 max_entropy = log2(len(set(s))) if len(set(s)) > 1 else 017 actual_entropy = entropy(s)18 return max_entropy - actual_entropy1920 # Calculate reduction in size possible based on redundancy21 reduction_potential = redundancy(data)2223 # Assuming compression is achieved based on redundancy24 max_possible_compression_ratio = 1.0 - (reduction_potential / log2(len(data)))25 26 # Qualitative check if 
max_possible_compression_ratio makes sense27 if max_possible_compression_ratio < 0.0 or max_possible_compression_ratio > 1.0:28 return 999.02930 # Verify compression is lossless (hypothetical check here)31 # Normally, if we had a compression algorithm, we'd test decompress(compress(data)) == data32 33 # Returning the hypothetical compression performance34 return max_possible_compression_ratio