From c51d5b2b2ad060e66f484987f9c86be875d4d001 Mon Sep 17 00:00:00 2001 From: brianpzaide Date: Fri, 8 Nov 2024 21:25:37 +0000 Subject: [PATCH] feat: implementing Huffman coding --- compression/compression.go | 237 ++++++++++++++++++++++++++++++++++++- compression/minpq.go | 89 ++++++++++++++ 2 files changed, 324 insertions(+), 2 deletions(-) create mode 100644 compression/minpq.go diff --git a/compression/compression.go b/compression/compression.go index c9e7217..879ca08 100644 --- a/compression/compression.go +++ b/compression/compression.go @@ -1,9 +1,242 @@ package compression +import ( + "bytes" + "fmt" + "log" + "strings" + "unicode/utf8" + + "encoding/binary" +) + +// represents a node in a Hufmann Tree, holds character and it's number of occurances. +type HNode struct { + char rune + freq int + left, right *HNode +} + +// implements the interface Orderable +func (hn *HNode) Less(other Orderable) bool { + otherhn, ok := other.(*HNode) + if !ok { + log.Printf("Type assertion failed: expected *HNode, got %T", other) + return false + } + return hn.freq < otherhn.freq +} + +// builds HNodes and puts them in a minimum priority queue +func buildHNodesPQ(chars []rune) *MinPQ[Orderable] { + charFreq, hNodesPQ := make(map[rune]int), NewMinPQ[Orderable]() + + for _, r := range chars { + charFreq[r] += 1 + } + + for k, v := range charFreq { + hf := &HNode{ + char: k, + freq: v, + } + hNodesPQ.Insert(hf) + } + + return hNodesPQ +} + +// builds a Hufmann Tree by picking the HNode having the least character frequency +func buildHuffmannTree(hfNodesPQ *MinPQ[Orderable]) *HNode { + var a, b *HNode + for !hfNodesPQ.IsEmpty() { + temp, err := hfNodesPQ.DeleteMin() + a = temp.(*HNode) + if err != nil { + log.Fatalln(err.Error()) + } + if hfNodesPQ.IsEmpty() { + break + } else { + temp, err := hfNodesPQ.DeleteMin() + b = temp.(*HNode) + if err != nil { + log.Fatalln(err.Error()) + } + } + c := &HNode{ + freq: a.freq + b.freq, + left: a, + right: b, + } + hfNodesPQ.Insert(c) + } + + return a +} + +// builds the mapping to encode the data. CodeCharMap maps the character to the Hufmann Code generated using the Hufmann Tree +func buildCodeCharMap(root *HNode) map[string]rune { + codeCharMap := make(map[string]rune) + var buildCodes func(node *HNode, codeBuffer *bytes.Buffer) + buildCodes = func(node *HNode, codeBuffer *bytes.Buffer) { + if node.left == nil && node.right == nil { + codeCharMap[codeBuffer.String()] = node.char + return + } + if node.left != nil { + codeBuffer.WriteByte('0') + buildCodes(node.left, codeBuffer) + codeBuffer.Truncate(codeBuffer.Len() - 1) + } + if node.right != nil { + codeBuffer.WriteByte('1') + buildCodes(node.right, codeBuffer) + codeBuffer.Truncate(codeBuffer.Len() - 1) + } + } + + if root.left == nil && root.right == nil { + codeCharMap["0"] = root.char + return codeCharMap + } + var codeBuffer bytes.Buffer + buildCodes(root, &codeBuffer) + return codeCharMap +} + +// Serializes the mapping used for encoding the data. Formatting: for each character and it's Hufmann code, +func serializeCodeCharMap(codeCharMap map[string]rune) ([]byte, error) { + var encodedBytesBuffer bytes.Buffer + + for k, v := range codeCharMap { + keyLen := len(k) + valLen := utf8.RuneLen(v) + if valLen == -1 { + return nil, fmt.Errorf("invalid UTF-8 rune: %v", v) + } + + // write key length and value length as single bytes + encodedBytesBuffer.WriteByte(byte(keyLen)) + encodedBytesBuffer.WriteByte(byte(valLen)) + + // write the key (code) and character + encodedBytesBuffer.WriteString(k) + encodedBytesBuffer.WriteRune(v) + } + + return encodedBytesBuffer.Bytes(), nil +} + +// Reconstructs the mapping used for encoding the data. +func deserializeCodeCharMap(encodedBytesBuffer *bytes.Buffer, encodingMappingLen int32) (map[string]rune, error) { + codeCharMap := make(map[string]rune) + n := 0 + + for n < int(encodingMappingLen) { + // Read the key length (uint8) + keyLenByte, err := encodedBytesBuffer.ReadByte() + if err != nil { + return nil, fmt.Errorf("failed to read key length: %v", err) + } + n += 1 + keyLen := int(keyLenByte) + + // Read the value length (uint8) + valLenByte, err := encodedBytesBuffer.ReadByte() + if err != nil { + return nil, fmt.Errorf("failed to read value length: %v", err) + } + n += 1 + valLen := int(valLenByte) + + // Read the key + key := make([]byte, keyLen) + if _, err := encodedBytesBuffer.Read(key); err != nil { + return nil, fmt.Errorf("failed to read key: %v", err) + } + n += keyLen + // Read the value (should be one rune) + valueBytes := make([]byte, valLen) + if _, err := encodedBytesBuffer.Read(valueBytes); err != nil { + return nil, fmt.Errorf("failed to read value: %v", err) + } + value, size := utf8.DecodeRune(valueBytes) + if size == 0 || value == utf8.RuneError { + log.Printf("Warning: invalid UTF-8 rune detected at position %d", n) + return nil, fmt.Errorf("invalid UTF-8 rune detected") + } + n += valLen + // Add to map + codeCharMap[string(key)] = value + } + + return codeCharMap, nil +} + +// encode the given string returns a string in the format: func Encode(s string) string { - return s + chars := []rune(s) + pq := buildHNodesPQ(chars) + root := buildHuffmannTree(pq) + + // codeCharMap will be used for decoding the string + codeCharMap := buildCodeCharMap(root) + + var encodedBytesBuffer bytes.Buffer + encodingMappingBytes, err := serializeCodeCharMap(codeCharMap) + if err != nil { + log.Fatalln(err.Error()) + } + + // writing the length of the mapping used for encoding the given string s + err = binary.Write(&encodedBytesBuffer, binary.BigEndian, int32(len(encodingMappingBytes))) + if err != nil { + log.Fatalln("Error occured in writing the length of the encoding mapping to the buffer:", err) + } + // writing the encoding mapping itself + encodedBytesBuffer.Write(encodingMappingBytes) + + // charCodeMap will be used for encoding the string + charCodeMap := make(map[rune]string) + for k, v := range codeCharMap { + charCodeMap[v] = k + } + + for _, el := range chars { + encodedBytesBuffer.WriteString(charCodeMap[el]) + } + + return encodedBytesBuffer.String() } +// decode the given string func Decode(s string) string { - return s + encodedDataBytes := bytes.NewBuffer([]byte(s)) + var encodingMappingLen int32 + err := binary.Read(encodedDataBytes, binary.BigEndian, &encodingMappingLen) + if err != nil { + log.Fatalln("Error reading encoding mapping length:", err) + } + + codeCharMap, err := deserializeCodeCharMap(encodedDataBytes, int32(encodingMappingLen)) + if err != nil { + log.Fatalln(err.Error()) + } + + var sofar bytes.Buffer + var decodedStringBuilder strings.Builder + + for encodedDataBytes.Len() > 0 { + k, err := encodedDataBytes.ReadByte() + if err != nil { + log.Fatalln(err.Error()) + } + sofar.WriteByte(k) + if v, ok := codeCharMap[sofar.String()]; ok { + decodedStringBuilder.WriteRune(v) + sofar.Reset() + } + } + return decodedStringBuilder.String() } diff --git a/compression/minpq.go b/compression/minpq.go new file mode 100644 index 0000000..2a266ef --- /dev/null +++ b/compression/minpq.go @@ -0,0 +1,89 @@ +package compression + +import ( + "fmt" +) + +type Orderable interface { + Less(other Orderable) bool +} + +type MinPQ[T Orderable] struct { + arr []T +} + +func NewMinPQ[T Orderable]() *MinPQ[T] { + return &MinPQ[T]{ + arr: make([]T, 0), + } +} + +func (pq *MinPQ[T]) IsEmpty() bool { + return len(pq.arr) == 0 +} + +func (pq *MinPQ[T]) Size() int { + return len(pq.arr) +} + +func (pq *MinPQ[T]) Min() (T, error) { + if len(pq.arr) == 0 { + var zero T + return zero, fmt.Errorf("priority queue is empty") + } + return pq.arr[0], nil +} + +func (pq *MinPQ[T]) swap(a, b int) { + pq.arr[a], pq.arr[b] = pq.arr[b], pq.arr[a] +} + +func (pq *MinPQ[T]) rise(k int) { + for k > 0 { + parent := (k - 1) / 2 + if !pq.arr[k].Less(pq.arr[parent]) { + break + } + pq.swap(k, parent) + k = parent + } +} + +func (pq *MinPQ[T]) sink(k int) { + n := len(pq.arr) + for 2*k+1 < n { + left := 2*k + 1 + right := left + 1 + smallest := left + + if right < n && pq.arr[right].Less(pq.arr[left]) { + smallest = right + } + + if !pq.arr[smallest].Less(pq.arr[k]) { + break + } + + pq.swap(k, smallest) + k = smallest + } +} + +func (pq *MinPQ[T]) Insert(x T) { + pq.arr = append(pq.arr, x) + pq.rise(len(pq.arr) - 1) +} + +func (pq *MinPQ[T]) DeleteMin() (T, error) { + if len(pq.arr) == 0 { + var zero T + return zero, fmt.Errorf("priority queue is empty") + } + x := pq.arr[0] + pq.arr[0] = pq.arr[len(pq.arr)-1] + pq.arr = pq.arr[:len(pq.arr)-1] + + pq.sink(0) + + return x, nil +}