Ultra-Fast Zero PNG
UltraFastZeroPNG.c - raw
/*
* UltraFastZeroPNG.c
*
* This C99 program makes a huge PNG very fast.
*
* Copyright (c) 2017-2019 Victor Zheng
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#define IMG_W (1 << 20)
#define IMG_H IMG_W
#include <stdio.h>
#include <stdint.h>
#include <time.h>
typedef uint8_t uchar;
typedef uint_fast8_t uint8;
typedef uint_fast16_t uint16;
typedef uint_fast32_t uint32;
typedef int_fast32_t int32;
uint32 crcTable[0x100];
void makeCRCtable()
{
#define POLY 0xedb88320 // reversed bit order
for (uint16 i = 0; i <= 0xFF; ++i) {
uint32 r = i;
for (uint8 j = 8; j; --j)
r = (r >> 1) ^ (POLY & (uint32)-((int32)r & 1));
crcTable[i] = r;
}
}
// For more speed, do the negations outside of these functions
uint32 crc32b(uint32 crc, const uchar c)
{
return crcTable[c ^ (crc & 0xFF)] ^ (crc >> 8);
}
uint32 crc32buf(uint32 crc, const uchar *buf, size_t len)
{
while (len--)
crc = crc32b(crc, *buf++);
return crc;
}
// matrix[i] is the ith column of the 32x32 matrix
// vec is a column vector
uint32 gf2_matrix_multiply(const uint32 matrix[32], uint32 vec)
{
uint32 r = 0;
for (const uint32 *m = matrix; vec; vec >>= 1, m++) {
if (vec & 1)
r ^= *m;
}
return r;
}
void gf2_matrix_square(uint32 dst[32], const uint32 src[32])
{
for (uint8 i = 0; i < 32; i++)
dst[i] = gf2_matrix_multiply(src, src[i]);
}
uint32 crc32zero(uint32 crc, size_t num)
{
uint32 matrix[2][32];
uint8 m = 0;
matrix[0][0] = POLY;
for (uint8 i = 1; i < 32; i++)
matrix[0][i] = 1 << (i - 1);
gf2_matrix_square(matrix[1], matrix[0]);
gf2_matrix_square(matrix[0], matrix[1]);
for (; num; num >>= 1) {
gf2_matrix_square(matrix[m ^ 1], matrix[m]);
m ^= 1;
if (num & 1)
crc = gf2_matrix_multiply(matrix[m], crc);
}
return crc;
}
void fwriteBytes(const uchar *buf, size_t len, uint32 *crc, FILE *f)
{
*crc = crc32buf(*crc, buf, len);
fwrite(buf, len, 1, f);
}
void fwriteZeros(size_t num, uint32 *crc, FILE *f)
{
*crc = crc32zero(*crc, num);
#if 1
// This works only if the system supports fseek past EOF
// and fills the gap with zero bytes.
fseek(f, num, SEEK_CUR);
#else
static const uchar zeros[0x10000]; // initialized to 0
// Write 64K blocks first
for (size_t i = num >> 16; i; --i)
fwrite(zeros, sizeof(zeros), 1, f);
// Write last block
num &= 0xFFFF;
fwrite(zeros, sizeof(char), num, f);
#endif
}
#if 0
size_t fwriteDEFLATEzeroFirstBlock(uint16 num, uint32 *crc, int last, FILE *f)
{
if (!num)
return 0;
uchar header[5] = {
(uchar)(last ? 1 : 0),
(uchar)(num >> 0),
(uchar)(num >> 8),
};
header[3] = ~header[1];
header[4] = ~header[2];
fwriteBytes(header, sizeof(header), crc, f);
fwriteZeros(num, crc, f);
return sizeof(header) + num;
}
#endif
size_t fwriteDEFLATEzeros(size_t num, uint32 *crc, FILE *f)
{
if (!num)
return 0;
// L tree
// 0: 285 - length 258
// 10: 0 - literal 0
// 11: 256 - end of block
// codelengths: 2, 0 * 138, 0 * 117, 2, 0 * 28, 1, (0...)
// D tree
// 0: 1
// 1: unused
// codelengths: 1, (0...)
// code length tree
// 0: 18
// 10: 1
// 11: 2
// codelength codelengths in special order
// 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
// 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, (0)
// ------------------------------
// Last block -> 1
// Dynamic Huffman block -> 10
// HLIT = 286-257 = 29 -> 11101
// HDIST = 1-1 = 0 -> 00000
// HCLEN = 18-4 = 14 -> 1110
// packed codelength codelengths:
// 000, 000, 001, 36 zero bits, 010, 000, 010
// packed L tree:
// 11, 0, 1111111, 0, 1101010, 11, 0, 0010001, 10 (reversed)
// packed D tree:
// 10 (reversed)
const uchar header[] = {
0xED, // 11101 10 1
0xC0, // 110 00000
0x81, // 1 000000 1-
0x00, // 000000 00-
0x00, // 00000000-
0x00, // 00000000-
0x00, // 00000000-
0x80, // 10 000000-
0xA0, // 1 010 000 0-
0xFD, // 111111 0 1-
0xA9, // 101010 0 1-
0x17, // 0001 0 11 1-
};
uint16 buf = 0x29; // 01 01 001-
uint8 bits = 7;
fwriteBytes(header, sizeof(header), crc, f);
size_t size = sizeof(header);
// If num is a multiple of 258, write 258 literals!
size_t q = (num - 1) / 258;
size_t r = ((num - 1) % 258) + 1;
#define WRITE(v, s) { \
buf |= v << bits; \
bits += s; \
if (bits >= 8) { \
*crc = crc32b(*crc, buf & 0xFF); \
fputc(buf & 0xFF, f); \
++size; \
buf >>= 8; \
bits -= 8; \
} \
}
while (r--) {
// write 10 (literal 0)
WRITE(1, 2);
}
while (q--) {
// write 00 (length 258, dist 1)
WRITE(0, 2);
if (!buf) {
// This is how to get high speed!
fwriteZeros(q >> 2, crc, f);
size += q >> 2;
// Use Duff's device instead of last 3 iterations
switch (q & 3) {
case 3: WRITE(0, 2);
case 2: WRITE(0, 2);
case 1: WRITE(0, 2);
}
break; // q = 0;
}
}
// write 11 (end of block)
WRITE(3, 2);
// flush
if (bits) { // (this is always true)
*crc = crc32b(*crc, buf & 0xFF);
fputc(buf & 0xFF, f);
++size;
}
return size;
}
size_t fwriteZLIBzeros(size_t num, uint32 *crc, FILE *f)
{
uchar header[2] = {
0x78, // 32K window size, DEFLATE
0x01, // "compressor used fastest algorithm"
// because recompression might be worthwhile
};
uint32 adler = ((num % 65521) << 16) | 1;
uchar trailer[4] = {
(uchar)(adler >> 24),
(uchar)(adler >> 16),
(uchar)(adler >> 8),
(uchar)(adler >> 0),
};
fwriteBytes(header, sizeof(header), crc, f);
size_t sizeDEFLATE = fwriteDEFLATEzeros(num, crc, f);
fwriteBytes(trailer, sizeof(trailer), crc, f);
return sizeof(header) + sizeDEFLATE + sizeof(trailer);
}
void packInt32be(uint32 i, uchar *buf)
{
buf[0] = (uchar)(i >> 24);
buf[1] = (uchar)(i >> 16);
buf[2] = (uchar)(i >> 8);
buf[3] = (uchar)(i >> 0);
}
int fwritePNG(uint32 width, uint32 height, FILE *f)
{
const size_t numBytes = (1 + (size_t)width) * height;
// Signature
fwrite("\x89PNG\r\n\x1A\n", sizeof(char), 8, f);
// IHDR
uchar IHDR[] = {
'I',
'H',
'D',
'R',
(width >> 24) & 0xFF,
(width >> 16) & 0xFF,
(width >> 8) & 0xFF,
(width >> 0) & 0xFF,
(height >> 24) & 0xFF,
(height >> 16) & 0xFF,
(height >> 8) & 0xFF,
(height >> 0) & 0xFF,
8, // bitdepth
0, // colortype
0, // compresstype=zlib (only supported method)
0, // filtermethod=adaptive (only supported method)
0, // interlaced=no
};
uchar i32[4];
packInt32be(13, i32); // 17-4
fwrite(i32, sizeof(i32), 1, f);
uint32 crc = 0xFFFFFFFF;
fwriteBytes(IHDR, sizeof(IHDR), &crc, f);
packInt32be(~crc, i32);
fwrite(i32, sizeof(i32), 1, f);
// IDAT
long IDAT_len_offset = ftell(f); // 33
fwrite("\0\0\0\0IDAT", sizeof(char), 8, f);
crc = 0xCA50F9E1; // ~CRC32("IDAT")
uint32 IDAT_len = (uint32)fwriteZLIBzeros(numBytes, &crc, f);
packInt32be(~crc, i32);
fwrite(i32, sizeof(i32), 1, f);
// IEND
// 0x00000000 + "IEND" + CRC32("IEND")
fwrite("\0\0\0\0IEND\xae\x42\x60\x82", sizeof(char), 12, f);
// Set IDAT length
fseek(f, IDAT_len_offset, SEEK_SET);
packInt32be(IDAT_len, i32);
fwrite(i32, sizeof(i32), 1, f);
return ferror(f);
}
int main()
{
makeCRCtable();
int ret = 0;
FILE *f = fopen("out.png", "wb");
if (!f)
return -1;
clock_t before = clock();
ret = fwritePNG(IMG_W, IMG_H, f);
clock_t after = clock();
fclose(f);
double diff = (double)(after - before) / CLOCKS_PER_SEC;
double rate = ((1 + IMG_W) * (double)IMG_H) / diff;
rate = rate * 8 / 1024 / 1024 / 1024 / 1024;
printf("Compressed at %.2f Tbit/s (%.9f s)\n", rate, diff);
return ret;
}