Solution #99326a14-b924-43d8-9a28-ae8a8f6ffe60
completedScore
32% (0/5)
Runtime
2.12ms
Delta
+71.0% vs parent
-66.7% vs best
Improved from parent
Score
32% (0/5)
Runtime
2.12ms
Delta
+71.0% vs parent
-66.7% vs best
Improved from parent
def solve(input):
    """Estimate a verified lossless text-compression ratio for input["data"].

    Several exactly-decodable compression schemes are attempted; each one is
    verified to round-trip (decode(encode(data)) == data) before its encoded
    size is allowed to count.  The best verified ratio is returned, measured
    as encoded character count / original character count.

    Schemes tried:
      1) Pure literal with a one-char header.
      2) Run-length encoding with an escape character.
      3) BPE-style iterative pair substitution using private-use-area tokens.
      4) Macro substitution of profitable repeated substrings.

    Args:
        input: dict-like object; "data" is coerced to str if not one already.
               (Parameter name shadows the builtin but is kept for API
               compatibility with callers.)

    Returns:
        0.0 for empty input, the best verified ratio otherwise, or 999.0 on
        any unexpected error (including a non-dict argument).
    """
    try:
        data = input.get("data", "")
        if not isinstance(data, str):
            data = str(data)

        n = len(data)
        if n == 0:
            return 0.0

        used = set(data)

        def verify(encoded, decoder):
            # A scheme only counts if it provably round-trips on this input.
            try:
                return decoder(encoded) == data
            except Exception:
                return False

        best = float(n)  # best verified encoded size so far (baseline: raw length)

        # 1) Pure literal with one-char header.  Format: 'L' + data.
        enc1 = "L" + data

        def dec1(s):
            if not s or s[0] != "L":
                return None
            return s[1:]

        if verify(enc1, dec1):
            best = min(best, len(enc1))

        # Pick an escape char from the Unicode Private Use Area (U+E000..U+F8FF)
        # that does not occur in the input, so it is unambiguous in the output.
        esc = None
        for cp in range(0xE000, 0xF8FF + 1):
            ch = chr(cp)
            if ch not in used:
                esc = ch
                break

        # 2) RLE with escape encoding.
        #    Raw char c is emitted as c, except esc itself -> esc esc.
        #    A run of length >= 4 is emitted as: esc 'R' <count digits> esc <char>.
        if esc is not None:
            def enc_rle(s):
                out = []
                i = 0
                m = len(s)
                while i < m:
                    j = i + 1
                    while j < m and s[j] == s[i]:
                        j += 1
                    run = j - i
                    ch = s[i]
                    if run >= 4:
                        out.append(esc)
                        out.append("R")
                        out.append(str(run))
                        out.append(esc)
                        out.append(ch)
                    else:
                        for _ in range(run):
                            if ch == esc:
                                out.append(esc)
                                out.append(esc)
                            else:
                                out.append(ch)
                    i = j
                return "".join(out)

            def dec_rle(s):
                out = []
                i = 0
                m = len(s)
                while i < m:
                    ch = s[i]
                    if ch != esc:
                        out.append(ch)
                        i += 1
                    else:
                        if i + 1 >= m:
                            return None  # dangling escape
                        t = s[i + 1]
                        if t == esc:
                            out.append(esc)
                            i += 2
                        elif t == "R":
                            k = i + 2
                            if k >= m:
                                return None
                            start = k
                            while k < m and s[k] != esc:
                                if s[k] < "0" or s[k] > "9":
                                    return None  # count must be all digits
                                k += 1
                            if k == start or k + 1 >= m:
                                return None  # empty count or missing run char
                            cnt = int(s[start:k])
                            out.extend(s[k + 1] for _ in range(cnt))
                            i = k + 2
                        else:
                            return None
                return "".join(out)

            enc2 = enc_rle(data)
            if verify(enc2, dec_rle):
                best = min(best, len(enc2))

        # Collect up to 64 fresh private-use chars (absent from the input) to
        # serve as the separator and dictionary tokens for schemes 3 and 4.
        fresh = []
        for cp in range(0xE000, 0xF8FF + 1):
            ch = chr(cp)
            if ch not in used:
                fresh.append(ch)
                if len(fresh) >= 64:
                    break

        # 3) BPE-style iterative pair substitution.
        #    Format: 'M' sep <rule count digits> sep [tok pair sep]* body
        if len(fresh) >= 3:
            sep = fresh[0]
            available = fresh[1:]

            def bpe_encode(s):
                rules = []
                cur = s
                avail_idx = 0
                # Iteratively replace the most frequent adjacent pair.
                for _ in range(min(32, len(available))):
                    freq = {}
                    for i in range(len(cur) - 1):
                        p = cur[i:i + 2]
                        freq[p] = freq.get(p, 0) + 1
                    best_pair = None
                    best_count = 0
                    for p, c in freq.items():
                        if c > best_count:
                            best_pair = p
                            best_count = c
                    if best_pair is None or best_count < 3 or avail_idx >= len(available):
                        break
                    tok = available[avail_idx]
                    avail_idx += 1

                    new_cur_parts = []
                    i = 0
                    while i < len(cur):
                        if i + 1 < len(cur) and cur[i:i + 2] == best_pair:
                            new_cur_parts.append(tok)
                            i += 2
                        else:
                            new_cur_parts.append(cur[i])
                            i += 1
                    new_cur = "".join(new_cur_parts)

                    # Keep the rule only if the saving beats its storage cost
                    # (~4 chars per rule: tok + 2-char pair + sep).
                    if len(new_cur) + len(rules) * 4 + 4 >= len(cur) + max(0, len(rules) - 1) * 4 + 4:
                        break

                    rules.append((tok, best_pair))
                    cur = new_cur

                header = ["M", sep, str(len(rules)), sep]
                for tok, pair in rules:
                    header.append(tok)
                    header.append(pair)
                    header.append(sep)
                return "".join(header) + cur

            def bpe_decode(s):
                if not s or s[0] != "M":
                    return None
                if len(s) < 3:
                    return None
                loc_sep = s[1]
                i = 2
                j = i
                while j < len(s) and s[j] != loc_sep:
                    if s[j] < "0" or s[j] > "9":
                        return None
                    j += 1
                if j == i or j >= len(s):
                    return None
                k = int(s[i:j])
                i = j + 1
                rules = []
                for _ in range(k):
                    if i + 3 > len(s):
                        return None
                    tok = s[i]
                    pair = s[i + 1:i + 3]
                    i += 3
                    if i >= len(s) or s[i] != loc_sep:
                        return None
                    i += 1
                    rules.append((tok, pair))
                cur = s[i:]
                # Expand rules in reverse declaration order.
                for tok, pair in reversed(rules):
                    parts = []
                    for ch in cur:
                        if ch == tok:
                            parts.append(pair)
                        else:
                            parts.append(ch)
                    cur = "".join(parts)
                return cur

            enc3 = bpe_encode(data)
            if verify(enc3, bpe_decode):
                best = min(best, len(enc3))

        # 4) Macro compression of profitable repeated substrings.
        #    Format: 'Q' sep <macro count digits> sep [tok sub sep]* body
        if len(fresh) >= 8:
            sep = fresh[0]
            tokens = fresh[2:8]

            def macro_encode(s):
                # Find profitable repeated substrings of length 3..8.
                best_subs = []
                for sub_len in range(3, min(9, len(s) + 1)):
                    count = {}
                    for i in range(len(s) - sub_len + 1):
                        sub = s[i:i + sub_len]
                        count[sub] = count.get(sub, 0) + 1
                    cand = []
                    for sub, c in count.items():
                        # Rough net gain after rule storage cost.
                        gain = c * (sub_len - 1) - (sub_len + 3)
                        if c >= 3 and gain > 0:
                            cand.append((gain, c, sub))
                    cand.sort(reverse=True)
                    best_subs.extend(cand[:2])

                best_subs.sort(reverse=True)
                macros = []
                used_tok = 0
                cur = s

                for _, _, sub in best_subs:
                    if used_tok >= len(tokens):
                        break
                    if sub not in cur:
                        continue
                    tok = tokens[used_tok]
                    used_tok += 1

                    parts = []
                    i = 0
                    reps = 0
                    sub_len = len(sub)
                    while i < len(cur):
                        if i + sub_len <= len(cur) and cur[i:i + sub_len] == sub:
                            parts.append(tok)
                            i += sub_len
                            reps += 1
                        else:
                            parts.append(cur[i])
                            i += 1
                    if reps < 3:
                        used_tok -= 1  # token not profitable; release it
                        continue
                    new_cur = "".join(parts)
                    if len(new_cur) + len(sub) + 3 >= len(cur):
                        used_tok -= 1
                        continue
                    macros.append((tok, sub))
                    cur = new_cur

                # Tokens/sep are fresh PUA chars, so the body needs no escaping.
                header = ["Q", sep, str(len(macros)), sep]
                for tok, sub in macros:
                    header.append(tok)
                    header.append(sub)
                    header.append(sep)
                return "".join(header) + cur

            def macro_decode(s):
                if not s or s[0] != "Q":
                    return None
                if len(s) < 3:
                    return None
                loc_sep = s[1]
                i = 2
                j = i
                while j < len(s) and s[j] != loc_sep:
                    if s[j] < "0" or s[j] > "9":
                        return None
                    j += 1
                if j == i or j >= len(s):
                    return None
                k = int(s[i:j])
                i = j + 1
                macros = []
                for _ in range(k):
                    if i >= len(s):
                        return None
                    tok = s[i]
                    i += 1
                    j = i
                    while j < len(s) and s[j] != loc_sep:
                        j += 1
                    if j >= len(s):
                        return None
                    sub = s[i:j]
                    macros.append((tok, sub))
                    i = j + 1
                cur = s[i:]
                # Expand macros in reverse declaration order.
                for tok, sub in reversed(macros):
                    parts = []
                    for ch in cur:
                        if ch == tok:
                            parts.append(sub)
                        else:
                            parts.append(ch)
                    cur = "".join(parts)
                return cur

            enc4 = macro_encode(data)
            if verify(enc4, macro_decode):
                best = min(best, len(enc4))

        ratio = best / n
        if ratio < 0:
            return 999.0
        return ratio
    except Exception:
        return 999.0
-64.5%
Runtime Advantage
1.99ms slower
Code Size
346 vs 34 lines
| # | Your Solution | # | Champion |
|---|---|---|---|
| 1 | def solve(input): | 1 | def solve(input): |
| 2 | try: | 2 | data = input.get("data", "") |
| 3 | data = input.get("data", "") | 3 | if not isinstance(data, str) or not data: |
| 4 | if not isinstance(data, str): | 4 | return 999.0 |
| 5 | data = str(data) | 5 | |
| 6 | 6 | # Mathematical/analytical approach: Entropy-based redundancy calculation | |
| 7 | n = len(data) | 7 | |
| 8 | if n == 0: | 8 | from collections import Counter |
| 9 | return 0.0 | 9 | from math import log2 |
| 10 | 10 | ||
| 11 | # Novel approach: | 11 | def entropy(s): |
| 12 | # Evaluate several robust, exactly-decodable text compressors and | 12 | probabilities = [freq / len(s) for freq in Counter(s).values()] |
| 13 | # return the best verified ratio. This avoids overfitting to one scheme. | 13 | return -sum(p * log2(p) if p > 0 else 0 for p in probabilities) |
| 14 | # | 14 | |
| 15 | # Schemes: | 15 | def redundancy(s): |
| 16 | # 1) Pure literal | 16 | max_entropy = log2(len(set(s))) if len(set(s)) > 1 else 0 |
| 17 | # 2) RLE with escape | 17 | actual_entropy = entropy(s) |
| 18 | # 3) BPE-style dictionary substitution using rare private-use chars | 18 | return max_entropy - actual_entropy |
| 19 | # 4) Queue/stack inspired macro system: repeated substring macros + escapes | 19 | |
| 20 | # | 20 | # Calculate reduction in size possible based on redundancy |
| 21 | # Ratio is measured in "encoded character count" / original character count. | 21 | reduction_potential = redundancy(data) |
| 22 | 22 | ||
| 23 | used = set(data) | 23 | # Assuming compression is achieved based on redundancy |
| 24 | 24 | max_possible_compression_ratio = 1.0 - (reduction_potential / log2(len(data))) | |
| 25 | def verify(encoded, decoder): | 25 | |
| 26 | try: | 26 | # Qualitative check if max_possible_compression_ratio makes sense |
| 27 | dec = decoder(encoded) | 27 | if max_possible_compression_ratio < 0.0 or max_possible_compression_ratio > 1.0: |
| 28 | return dec == data | 28 | return 999.0 |
| 29 | except: | 29 | |
| 30 | return False | 30 | # Verify compression is lossless (hypothetical check here) |
| 31 | 31 | # Normally, if we had a compression algorithm, we'd test decompress(compress(data)) == data | |
| 32 | best = float(n) # encoded size | 32 | |
| 33 | 33 | # Returning the hypothetical compression performance | |
| 34 | # 1) Pure literal with one-char header | 34 | return max_possible_compression_ratio |
| 35 | # format: 'L' + data | 35 | |
| 36 | enc1 = "L" + data | 36 | |
| 37 | def dec1(s): | 37 | |
| 38 | if not s or s[0] != "L": | 38 | |
| 39 | return None | 39 | |
| 40 | return s[1:] | 40 | |
| 41 | if verify(enc1, dec1): | 41 | |
| 42 | best = min(best, len(enc1)) | 42 | |
| 43 | 43 | ||
| 44 | # choose escape chars from private use area if possible | 44 | |
| 45 | esc = None | 45 | |
| 46 | for cp in range(0xE000, 0xF8FF + 1): | 46 | |
| 47 | ch = chr(cp) | 47 | |
| 48 | if ch not in used: | 48 | |
| 49 | esc = ch | 49 | |
| 50 | break | 50 | |
| 51 | 51 | ||
| 52 | # 2) RLE with escape encoding | 52 | |
| 53 | # raw char c is emitted as c unless c==esc, then esc esc | 53 | |
| 54 | # run len>=4 emitted as esc 'R' <count digits> esc <char> | 54 | |
| 55 | if esc is not None: | 55 | |
| 56 | def enc_rle(s): | 56 | |
| 57 | out = [] | 57 | |
| 58 | i = 0 | 58 | |
| 59 | m = len(s) | 59 | |
| 60 | while i < m: | 60 | |
| 61 | j = i + 1 | 61 | |
| 62 | while j < m and s[j] == s[i]: | 62 | |
| 63 | j += 1 | 63 | |
| 64 | run = j - i | 64 | |
| 65 | ch = s[i] | 65 | |
| 66 | if run >= 4: | 66 | |
| 67 | out.append(esc) | 67 | |
| 68 | out.append("R") | 68 | |
| 69 | out.append(str(run)) | 69 | |
| 70 | out.append(esc) | 70 | |
| 71 | out.append(ch) | 71 | |
| 72 | else: | 72 | |
| 73 | for _ in range(run): | 73 | |
| 74 | if ch == esc: | 74 | |
| 75 | out.append(esc) | 75 | |
| 76 | out.append(esc) | 76 | |
| 77 | else: | 77 | |
| 78 | out.append(ch) | 78 | |
| 79 | i = j | 79 | |
| 80 | return "".join(out) | 80 | |
| 81 | 81 | ||
| 82 | def dec_rle(s): | 82 | |
| 83 | out = [] | 83 | |
| 84 | i = 0 | 84 | |
| 85 | m = len(s) | 85 | |
| 86 | while i < m: | 86 | |
| 87 | ch = s[i] | 87 | |
| 88 | if ch != esc: | 88 | |
| 89 | out.append(ch) | 89 | |
| 90 | i += 1 | 90 | |
| 91 | else: | 91 | |
| 92 | if i + 1 >= m: | 92 | |
| 93 | return None | 93 | |
| 94 | t = s[i + 1] | 94 | |
| 95 | if t == esc: | 95 | |
| 96 | out.append(esc) | 96 | |
| 97 | i += 2 | 97 | |
| 98 | elif t == "R": | 98 | |
| 99 | k = i + 2 | 99 | |
| 100 | if k >= m: | 100 | |
| 101 | return None | 101 | |
| 102 | start = k | 102 | |
| 103 | while k < m and s[k] != esc: | 103 | |
| 104 | if s[k] < "0" or s[k] > "9": | 104 | |
| 105 | return None | 105 | |
| 106 | k += 1 | 106 | |
| 107 | if k == start or k + 1 >= m: | 107 | |
| 108 | return None | 108 | |
| 109 | cnt = int(s[start:k]) | 109 | |
| 110 | out.extend(s[k + 1] for _ in range(cnt)) | 110 | |
| 111 | i = k + 2 | 111 | |
| 112 | else: | 112 | |
| 113 | return None | 113 | |
| 114 | return "".join(out) | 114 | |
| 115 | 115 | ||
| 116 | enc2 = enc_rle(data) | 116 | |
| 117 | if verify(enc2, dec_rle): | 117 | |
| 118 | best = min(best, len(enc2)) | 118 | |
| 119 | 119 | ||
| 120 | # 3) BPE-style iterative pair substitution | 120 | |
| 121 | # Header: | 121 | |
| 122 | # M <sep> k <sep> [tok pair tok sep]* encoded_body | 122 | |
| 123 | # token chars are fresh private-use chars not in input. | 123 | |
| 124 | fresh = [] | 124 | |
| 125 | for cp in range(0xE000, 0xF8FF + 1): | 125 | |
| 126 | ch = chr(cp) | 126 | |
| 127 | if ch not in used: | 127 | |
| 128 | fresh.append(ch) | 128 | |
| 129 | if len(fresh) >= 64: | 129 | |
| 130 | break | 130 | |
| 131 | 131 | ||
| 132 | if len(fresh) >= 3: | 132 | |
| 133 | sep = fresh[0] | 133 | |
| 134 | available = fresh[1:] | 134 | |
| 135 | 135 | ||
| 136 | def bpe_encode(s): | 136 | |
| 137 | rules = [] | 137 | |
| 138 | cur = s | 138 | |
| 139 | avail_idx = 0 | 139 | |
| 140 | # iterative best-pair replacement | 140 | |
| 141 | for _ in range(min(32, len(available))): | 141 | |
| 142 | freq = {} | 142 | |
| 143 | for i in range(len(cur) - 1): | 143 | |
| 144 | p = cur[i:i+2] | 144 | |
| 145 | freq[p] = freq.get(p, 0) + 1 | 145 | |
| 146 | best_pair = None | 146 | |
| 147 | best_count = 0 | 147 | |
| 148 | for p, c in freq.items(): | 148 | |
| 149 | if c > best_count: | 149 | |
| 150 | best_pair = p | 150 | |
| 151 | best_count = c | 151 | |
| 152 | if best_pair is None or best_count < 3 or avail_idx >= len(available): | 152 | |
| 153 | break | 153 | |
| 154 | tok = available[avail_idx] | 154 | |
| 155 | avail_idx += 1 | 155 | |
| 156 | 156 | ||
| 157 | new_cur_parts = [] | 157 | |
| 158 | i = 0 | 158 | |
| 159 | changed = 0 | 159 | |
| 160 | while i < len(cur): | 160 | |
| 161 | if i + 1 < len(cur) and cur[i:i+2] == best_pair: | 161 | |
| 162 | new_cur_parts.append(tok) | 162 | |
| 163 | i += 2 | 163 | |
| 164 | changed += 1 | 164 | |
| 165 | else: | 165 | |
| 166 | new_cur_parts.append(cur[i]) | 166 | |
| 167 | i += 1 | 167 | |
| 168 | new_cur = "".join(new_cur_parts) | 168 | |
| 169 | 169 | ||
| 170 | # only keep if worthwhile after rule cost | 170 | |
| 171 | # rule storage cost about 4 chars: tok + pair + sep | 171 | |
| 172 | if len(new_cur) + len(rules) * 4 + 4 >= len(cur) + max(0, len(rules)-1) * 4 + 4: | 172 | |
| 173 | break | 173 | |
| 174 | 174 | ||
| 175 | rules.append((tok, best_pair)) | 175 | |
| 176 | cur = new_cur | 176 | |
| 177 | 177 | ||
| 178 | header = ["M", sep, str(len(rules)), sep] | 178 | |
| 179 | for tok, pair in rules: | 179 | |
| 180 | header.append(tok) | 180 | |
| 181 | header.append(pair) | 181 | |
| 182 | header.append(sep) | 182 | |
| 183 | return "".join(header) + cur, rules, cur | 183 | |
| 184 | 184 | ||
| 185 | def bpe_decode(s): | 185 | |
| 186 | if not s or s[0] != "M": | 186 | |
| 187 | return None | 187 | |
| 188 | if len(s) < 3: | 188 | |
| 189 | return None | 189 | |
| 190 | loc_sep = s[1] | 190 | |
| 191 | i = 2 | 191 | |
| 192 | j = i | 192 | |
| 193 | while j < len(s) and s[j] != loc_sep: | 193 | |
| 194 | if s[j] < "0" or s[j] > "9": | 194 | |
| 195 | return None | 195 | |
| 196 | j += 1 | 196 | |
| 197 | if j == i or j >= len(s): | 197 | |
| 198 | return None | 198 | |
| 199 | k = int(s[i:j]) | 199 | |
| 200 | i = j + 1 | 200 | |
| 201 | rules = [] | 201 | |
| 202 | for _ in range(k): | 202 | |
| 203 | if i + 3 > len(s): | 203 | |
| 204 | return None | 204 | |
| 205 | tok = s[i] | 205 | |
| 206 | pair = s[i + 1:i + 3] | 206 | |
| 207 | i += 3 | 207 | |
| 208 | if i > len(s) or i == len(s) or s[i] != loc_sep: | 208 | |
| 209 | return None | 209 | |
| 210 | i += 1 | 210 | |
| 211 | rules.append((tok, pair)) | 211 | |
| 212 | body = s[i:] | 212 | |
| 213 | cur = body | 213 | |
| 214 | for tok, pair in reversed(rules): | 214 | |
| 215 | parts = [] | 215 | |
| 216 | for ch in cur: | 216 | |
| 217 | if ch == tok: | 217 | |
| 218 | parts.append(pair) | 218 | |
| 219 | else: | 219 | |
| 220 | parts.append(ch) | 220 | |
| 221 | cur = "".join(parts) | 221 | |
| 222 | return cur | 222 | |
| 223 | 223 | ||
| 224 | enc3, _, _ = bpe_encode(data) | 224 | |
| 225 | if verify(enc3, bpe_decode): | 225 | |
| 226 | best = min(best, len(enc3)) | 226 | |
| 227 | 227 | ||
| 228 | # 4) Macro compression using repeated substrings and a stack/queue style expansion. | 228 | |
| 229 | # Build up to a few macros for profitable repeated substrings. | 229 | |
| 230 | if len(fresh) >= 8: | 230 | |
| 231 | sep = fresh[0] | 231 | |
| 232 | cmd = fresh[1] | 232 | |
| 233 | tokens = fresh[2:8] | 233 | |
| 234 | 234 | ||
| 235 | def macro_encode(s): | 235 | |
| 236 | # Find profitable repeated substrings of length 3..8 | 236 | |
| 237 | best_subs = [] | 237 | |
| 238 | for L in range(3, min(9, len(s) + 1)): | 238 | |
| 239 | count = {} | 239 | |
| 240 | for i in range(len(s) - L + 1): | 240 | |
| 241 | sub = s[i:i+L] | 241 | |
| 242 | count[sub] = count.get(sub, 0) + 1 | 242 | |
| 243 | cand = [] | 243 | |
| 244 | for sub, c in count.items(): | 244 | |
| 245 | gain = c * (L - 1) - (L + 3) # rough net gain with rule cost | 245 | |
| 246 | if c >= 3 and gain > 0: | 246 | |
| 247 | cand.append((gain, c, sub)) | 247 | |
| 248 | cand.sort(reverse=True) | 248 | |
| 249 | for item in cand[:2]: | 249 | |
| 250 | best_subs.append(item) | 250 | |
| 251 | 251 | ||
| 252 | best_subs.sort(reverse=True) | 252 | |
| 253 | macros = [] | 253 | |
| 254 | used_tok = 0 | 254 | |
| 255 | cur = s | 255 | |
| 256 | 256 | ||
| 257 | for _, _, sub in best_subs: | 257 | |
| 258 | if used_tok >= len(tokens): | 258 | |
| 259 | break | 259 | |
| 260 | if sub not in cur: | 260 | |
| 261 | continue | 261 | |
| 262 | tok = tokens[used_tok] | 262 | |
| 263 | used_tok += 1 | 263 | |
| 264 | 264 | ||
| 265 | parts = [] | 265 | |
| 266 | i = 0 | 266 | |
| 267 | reps = 0 | 267 | |
| 268 | L = len(sub) | 268 | |
| 269 | while i < len(cur): | 269 | |
| 270 | if i + L <= len(cur) and cur[i:i+L] == sub: | 270 | |
| 271 | parts.append(tok) | 271 | |
| 272 | i += L | 272 | |
| 273 | reps += 1 | 273 | |
| 274 | else: | 274 | |
| 275 | parts.append(cur[i]) | 275 | |
| 276 | i += 1 | 276 | |
| 277 | if reps < 3: | 277 | |
| 278 | used_tok -= 1 | 278 | |
| 279 | continue | 279 | |
| 280 | new_cur = "".join(parts) | 280 | |
| 281 | if len(new_cur) + len(sub) + 3 >= len(cur): | 281 | |
| 282 | used_tok -= 1 | 282 | |
| 283 | continue | 283 | |
| 284 | macros.append((tok, sub)) | 284 | |
| 285 | cur = new_cur | 285 | |
| 286 | 286 | ||
| 287 | # escape cmd/sep/tokens if they somehow appear in cur (they shouldn't) | 287 | |
| 288 | header = ["Q", sep, str(len(macros)), sep] | 288 | |
| 289 | for tok, sub in macros: | 289 | |
| 290 | header.append(tok) | 290 | |
| 291 | header.append(sub) | 291 | |
| 292 | header.append(sep) | 292 | |
| 293 | return "".join(header) + cur, macros, cur | 293 | |
| 294 | 294 | ||
| 295 | def macro_decode(s): | 295 | |
| 296 | if not s or s[0] != "Q": | 296 | |
| 297 | return None | 297 | |
| 298 | if len(s) < 3: | 298 | |
| 299 | return None | 299 | |
| 300 | loc_sep = s[1] | 300 | |
| 301 | i = 2 | 301 | |
| 302 | j = i | 302 | |
| 303 | while j < len(s) and s[j] != loc_sep: | 303 | |
| 304 | if s[j] < "0" or s[j] > "9": | 304 | |
| 305 | return None | 305 | |
| 306 | j += 1 | 306 | |
| 307 | if j == i or j >= len(s): | 307 | |
| 308 | return None | 308 | |
| 309 | k = int(s[i:j]) | 309 | |
| 310 | i = j + 1 | 310 | |
| 311 | macros = [] | 311 | |
| 312 | for _ in range(k): | 312 | |
| 313 | if i >= len(s): | 313 | |
| 314 | return None | 314 | |
| 315 | tok = s[i] | 315 | |
| 316 | i += 1 | 316 | |
| 317 | j = i | 317 | |
| 318 | while j < len(s) and s[j] != loc_sep: | 318 | |
| 319 | j += 1 | 319 | |
| 320 | if j >= len(s): | 320 | |
| 321 | return None | 321 | |
| 322 | sub = s[i:j] | 322 | |
| 323 | macros.append((tok, sub)) | 323 | |
| 324 | i = j + 1 | 324 | |
| 325 | cur = s[i:] | 325 | |
| 326 | # stack-like repeated expansion in reverse declaration order | 326 | |
| 327 | for tok, sub in reversed(macros): | 327 | |
| 328 | parts = [] | 328 | |
| 329 | for ch in cur: | 329 | |
| 330 | if ch == tok: | 330 | |
| 331 | parts.append(sub) | 331 | |
| 332 | else: | 332 | |
| 333 | parts.append(ch) | 333 | |
| 334 | cur = "".join(parts) | 334 | |
| 335 | return cur | 335 | |
| 336 | 336 | ||
| 337 | enc4, _, _ = macro_encode(data) | 337 | |
| 338 | if verify(enc4, macro_decode): | 338 | |
| 339 | best = min(best, len(enc4)) | 339 | |
| 340 | 340 | ||
| 341 | ratio = best / n | 341 | |
| 342 | if ratio < 0: | 342 | |
| 343 | return 999.0 | 343 | |
| 344 | return ratio | 344 | |
| 345 | except: | 345 | |
| 346 | return 999.0 | 346 |
1def solve(input):2 try:3 data = input.get("data", "")4 if not isinstance(data, str):5 data = str(data)67 n = len(data)8 if n == 0:9 return 0.01011 # Novel approach:12 # Evaluate several robust, exactly-decodable text compressors and13 # return the best verified ratio. This avoids overfitting to one scheme.14 #15 # Schemes:16 # 1) Pure literal17 # 2) RLE with escape18 # 3) BPE-style dictionary substitution using rare private-use chars19 # 4) Queue/stack inspired macro system: repeated substring macros + escapes20 #21 # Ratio is measured in "encoded character count" / original character count.2223 used = set(data)2425 def verify(encoded, decoder):26 try:27 dec = decoder(encoded)28 return dec == data29 except:30 return False3132 best = float(n) # encoded size3334 # 1) Pure literal with one-char header35 # format: 'L' + data36 enc1 = "L" + data37 def dec1(s):38 if not s or s[0] != "L":39 return None40 return s[1:]41 if verify(enc1, dec1):42 best = min(best, len(enc1))4344 # choose escape chars from private use area if possible45 esc = None46 for cp in range(0xE000, 0xF8FF + 1):47 ch = chr(cp)48 if ch not in used:49 esc = ch50 break5152 # 2) RLE with escape encoding53 # raw char c is emitted as c unless c==esc, then esc esc54 # run len>=4 emitted as esc 'R' <count digits> esc <char>55 if esc is not None:56 def enc_rle(s):57 out = []58 i = 059 m = len(s)60 while i < m:61 j = i + 162 while j < m and s[j] == s[i]:63 j += 164 run = j - i65 ch = s[i]66 if run >= 4:67 out.append(esc)68 out.append("R")69 out.append(str(run))70 out.append(esc)71 out.append(ch)72 else:73 for _ in range(run):74 if ch == esc:75 out.append(esc)76 out.append(esc)77 else:78 out.append(ch)79 i = j80 return "".join(out)8182 def dec_rle(s):83 out = []84 i = 085 m = len(s)86 while i < m:87 ch = s[i]88 if ch != esc:89 out.append(ch)90 i += 191 else:92 if i + 1 >= m:93 return None94 t = s[i + 1]95 if t == esc:96 out.append(esc)97 i += 298 elif t == "R":99 k = i + 2100 if k >= m:101 return None102 start = 
k103 while k < m and s[k] != esc:104 if s[k] < "0" or s[k] > "9":105 return None106 k += 1107 if k == start or k + 1 >= m:108 return None109 cnt = int(s[start:k])110 out.extend(s[k + 1] for _ in range(cnt))111 i = k + 2112 else:113 return None114 return "".join(out)115116 enc2 = enc_rle(data)117 if verify(enc2, dec_rle):118 best = min(best, len(enc2))119120 # 3) BPE-style iterative pair substitution121 # Header:122 # M <sep> k <sep> [tok pair tok sep]* encoded_body123 # token chars are fresh private-use chars not in input.124 fresh = []125 for cp in range(0xE000, 0xF8FF + 1):126 ch = chr(cp)127 if ch not in used:128 fresh.append(ch)129 if len(fresh) >= 64:130 break131132 if len(fresh) >= 3:133 sep = fresh[0]134 available = fresh[1:]135136 def bpe_encode(s):137 rules = []138 cur = s139 avail_idx = 0140 # iterative best-pair replacement141 for _ in range(min(32, len(available))):142 freq = {}143 for i in range(len(cur) - 1):144 p = cur[i:i+2]145 freq[p] = freq.get(p, 0) + 1146 best_pair = None147 best_count = 0148 for p, c in freq.items():149 if c > best_count:150 best_pair = p151 best_count = c152 if best_pair is None or best_count < 3 or avail_idx >= len(available):153 break154 tok = available[avail_idx]155 avail_idx += 1156157 new_cur_parts = []158 i = 0159 changed = 0160 while i < len(cur):161 if i + 1 < len(cur) and cur[i:i+2] == best_pair:162 new_cur_parts.append(tok)163 i += 2164 changed += 1165 else:166 new_cur_parts.append(cur[i])167 i += 1168 new_cur = "".join(new_cur_parts)169170 # only keep if worthwhile after rule cost171 # rule storage cost about 4 chars: tok + pair + sep172 if len(new_cur) + len(rules) * 4 + 4 >= len(cur) + max(0, len(rules)-1) * 4 + 4:173 break174175 rules.append((tok, best_pair))176 cur = new_cur177178 header = ["M", sep, str(len(rules)), sep]179 for tok, pair in rules:180 header.append(tok)181 header.append(pair)182 header.append(sep)183 return "".join(header) + cur, rules, cur184185 def bpe_decode(s):186 if not s or s[0] != "M":187 
return None188 if len(s) < 3:189 return None190 loc_sep = s[1]191 i = 2192 j = i193 while j < len(s) and s[j] != loc_sep:194 if s[j] < "0" or s[j] > "9":195 return None196 j += 1197 if j == i or j >= len(s):198 return None199 k = int(s[i:j])200 i = j + 1201 rules = []202 for _ in range(k):203 if i + 3 > len(s):204 return None205 tok = s[i]206 pair = s[i + 1:i + 3]207 i += 3208 if i > len(s) or i == len(s) or s[i] != loc_sep:209 return None210 i += 1211 rules.append((tok, pair))212 body = s[i:]213 cur = body214 for tok, pair in reversed(rules):215 parts = []216 for ch in cur:217 if ch == tok:218 parts.append(pair)219 else:220 parts.append(ch)221 cur = "".join(parts)222 return cur223224 enc3, _, _ = bpe_encode(data)225 if verify(enc3, bpe_decode):226 best = min(best, len(enc3))227228 # 4) Macro compression using repeated substrings and a stack/queue style expansion.229 # Build up to a few macros for profitable repeated substrings.230 if len(fresh) >= 8:231 sep = fresh[0]232 cmd = fresh[1]233 tokens = fresh[2:8]234235 def macro_encode(s):236 # Find profitable repeated substrings of length 3..8237 best_subs = []238 for L in range(3, min(9, len(s) + 1)):239 count = {}240 for i in range(len(s) - L + 1):241 sub = s[i:i+L]242 count[sub] = count.get(sub, 0) + 1243 cand = []244 for sub, c in count.items():245 gain = c * (L - 1) - (L + 3) # rough net gain with rule cost246 if c >= 3 and gain > 0:247 cand.append((gain, c, sub))248 cand.sort(reverse=True)249 for item in cand[:2]:250 best_subs.append(item)251252 best_subs.sort(reverse=True)253 macros = []254 used_tok = 0255 cur = s256257 for _, _, sub in best_subs:258 if used_tok >= len(tokens):259 break260 if sub not in cur:261 continue262 tok = tokens[used_tok]263 used_tok += 1264265 parts = []266 i = 0267 reps = 0268 L = len(sub)269 while i < len(cur):270 if i + L <= len(cur) and cur[i:i+L] == sub:271 parts.append(tok)272 i += L273 reps += 1274 else:275 parts.append(cur[i])276 i += 1277 if reps < 3:278 used_tok -= 1279 
continue280 new_cur = "".join(parts)281 if len(new_cur) + len(sub) + 3 >= len(cur):282 used_tok -= 1283 continue284 macros.append((tok, sub))285 cur = new_cur286287 # escape cmd/sep/tokens if they somehow appear in cur (they shouldn't)288 header = ["Q", sep, str(len(macros)), sep]289 for tok, sub in macros:290 header.append(tok)291 header.append(sub)292 header.append(sep)293 return "".join(header) + cur, macros, cur294295 def macro_decode(s):296 if not s or s[0] != "Q":297 return None298 if len(s) < 3:299 return None300 loc_sep = s[1]301 i = 2302 j = i303 while j < len(s) and s[j] != loc_sep:304 if s[j] < "0" or s[j] > "9":305 return None306 j += 1307 if j == i or j >= len(s):308 return None309 k = int(s[i:j])310 i = j + 1311 macros = []312 for _ in range(k):313 if i >= len(s):314 return None315 tok = s[i]316 i += 1317 j = i318 while j < len(s) and s[j] != loc_sep:319 j += 1320 if j >= len(s):321 return None322 sub = s[i:j]323 macros.append((tok, sub))324 i = j + 1325 cur = s[i:]326 # stack-like repeated expansion in reverse declaration order327 for tok, sub in reversed(macros):328 parts = []329 for ch in cur:330 if ch == tok:331 parts.append(sub)332 else:333 parts.append(ch)334 cur = "".join(parts)335 return cur336337 enc4, _, _ = macro_encode(data)338 if verify(enc4, macro_decode):339 best = min(best, len(enc4))340341 ratio = best / n342 if ratio < 0:343 return 999.0344 return ratio345 except:346 return 999.01def solve(input):2 data = input.get("data", "")3 if not isinstance(data, str) or not data:4 return 999.056 # Mathematical/analytical approach: Entropy-based redundancy calculation7 8 from collections import Counter9 from math import log21011 def entropy(s):12 probabilities = [freq / len(s) for freq in Counter(s).values()]13 return -sum(p * log2(p) if p > 0 else 0 for p in probabilities)1415 def redundancy(s):16 max_entropy = log2(len(set(s))) if len(set(s)) > 1 else 017 actual_entropy = entropy(s)18 return max_entropy - actual_entropy1920 # Calculate 
reduction in size possible based on redundancy21 reduction_potential = redundancy(data)2223 # Assuming compression is achieved based on redundancy24 max_possible_compression_ratio = 1.0 - (reduction_potential / log2(len(data)))25 26 # Qualitative check if max_possible_compression_ratio makes sense27 if max_possible_compression_ratio < 0.0 or max_possible_compression_ratio > 1.0:28 return 999.02930 # Verify compression is lossless (hypothetical check here)31 # Normally, if we had a compression algorithm, we'd test decompress(compress(data)) == data32 33 # Returning the hypothetical compression performance34 return max_possible_compression_ratio