You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

100 lines
2.7 KiB
Java

import java.io.FileInputStream;
import java.util.*;
public class Lzw {
static final char END = (char) 0xFFFF;
public static int[] lzwEncoding(String s) {
List<Integer> result = new ArrayList<>();
Map<String, Integer> dict = new HashMap<>();
int cnt = 0;
for (int i = 0; i < 256; i++)
dict.put(String.valueOf((char) i), cnt++);
dict.put(String.valueOf(END), cnt++);
String cur = "";
for (char ch : s.toCharArray()) {
cur += ch;
if (!dict.containsKey(cur)) {
Integer e = dict.get(cur.substring(0, cur.length() - 1));
result.add(e);
dict.put(cur, cnt++);
cur = String.valueOf(ch);
}
}
result.add(dict.get(cur));
result.add(dict.get(String.valueOf(END)));
int[] res = new int[result.size()];
for (int i = 0; i < res.length; i++)
res[i] = result.get(i);
System.out.printf("Dict items: %,d\n", dict.size());
// System.out.printf("Dict size: %,d\n", SizingAgent.deepSizeOf(dict));
return res;
}
public static String lzwDecoding(int[] s) {
StringBuilder sb = new StringBuilder();
Map<Integer, String> dict = new HashMap<>();
int cnt = 0;
for (int i = 0; i < 256; i++)
dict.put(cnt++, String.valueOf((char) i));
dict.put(cnt++, String.valueOf(END));
String cur = dict.get(s[0]);
sb.append(cur);
char a = cur.charAt(0);
for (int i = 1; i < s.length; i++) {
int code = s[i];
if (code < cnt) {
String str = dict.get(code);
a = str.charAt(0);
if (a == END)
break;
dict.put(cnt++, cur + a);
sb.append(str);
cur = str;
} else {
cur += a;
dict.put(cnt++, cur);
sb.append(cur);
}
}
return sb.toString();
}
// Usage example
public static void main(String[] args) throws Exception {
FileInputStream fs = new FileInputStream("src/Lzw.java");
byte[] data = new byte[1_000_000];
int len = fs.read(data, 0, data.length);
data = Arrays.copyOf(data, len);
char[] buffer = new char[len];
for (int i = 0; i < len; i++)
buffer[i] = (char) (Byte.toUnsignedInt(data[i]));
String s = new String(buffer, 0, len);
int[] encoded = lzwEncoding(s);
String s2 = lzwDecoding(encoded);
byte[] data2 = new byte[len];
for (int i = 0; i < len; i++)
data2[i] = (byte) s2.charAt(i);
System.out.println(Arrays.equals(data, data2));
Locale.setDefault(Locale.US);
System.out.printf("%d -> %.0f\n", s.length(), optimalCompressedLength(encoded));
}
static double optimalCompressedLength(int[] a) {
int max = 0;
for (int x : a)
max = Math.max(max, x);
int[] freq = new int[max + 1];
for (int x : a)
++freq[x];
double optimalLength = 0;
for (int f : freq)
if (f > 0)
optimalLength += f * Math.log((double) a.length / f) / Math.log(2) / 8;
return optimalLength;
}
}