From: Mikko Rasa <tdb@tdb.fi>
Date: Thu, 2 Aug 2012 08:11:40 +0000 (+0300)
Subject: Use custom encoding for floats in binary format
X-Git-Url: http://git.tdb.fi/?a=commitdiff_plain;h=19179a622c1de88de5ed7047643eec79f285bf2a;p=libs%2Fdatafile.git

Use custom encoding for floats in binary format

This makes the binary format fully machine-independent, and provides
control over the precision of floating point values.
---

diff --git a/source/binaryparser.cpp b/source/binaryparser.cpp
index 81a86ea..fa99a7e 100644
--- a/source/binaryparser.cpp
+++ b/source/binaryparser.cpp
@@ -1,7 +1,9 @@
+#include <limits>
 #include <sys/param.h>
 #include <msp/core/maputils.h>
 #include <msp/strings/format.h>
 #include "binaryparser.h"
+#include "binfloat.h"
 #include "input.h"
 
 using namespace std;
@@ -22,10 +24,12 @@ public:
 
 BinaryParser::BinaryParser(Input &i, const string &s):
 	ParserMode(i, s),
-	first(true)
+	first(true),
+	float_precision(32)  // default width in bits; replaced when a __flt statement is parsed
 {
 	dict[-1] = DictEntry("__kwd", "iss");
 	dict[-2] = DictEntry("__str", "is");
+	dict[-3] = DictEntry("__flt", "i");  // built-in statement that carries the float precision
 }
 
 Statement BinaryParser::parse()
@@ -51,6 +55,8 @@ Statement BinaryParser::parse()
 			const unsigned id = st.args[0].get<unsigned>();
 			strings[id] = st.args[1].get<const string &>();
 		}
+		else if(st.keyword=="__flt")
+			float_precision = st.args[0].get<unsigned>();
 		else
 			return st;
 	}
@@ -127,21 +133,47 @@ IntType::Store BinaryParser::parse_int()
 
 FloatType::Store BinaryParser::parse_float()
 {
-	union
+	UInt64 encoded = 0;  // raw bit pattern, assembled most significant byte first
+	for(unsigned i=0; i<float_precision; i+=8)  // one input byte per 8 bits of precision
 	{
-		float f;
-		char d[sizeof(float)];
-	};
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-	for(unsigned i = sizeof(float); i--;)
-		d[i] = in.get();
-#else
-	for(unsigned i = 0; i<sizeof(float); ++i)
-		d[i] = in.get();
-#endif
-
-	return f;
+		int c = in.get();
+		encoded = (encoded<<8) | (c&0xFF);  // NOTE(review): c is masked unconditionally; confirm how in.get() reports end of input
+	}
+
+	BinFloat bf = BinFloat::explode(encoded, float_precision);  // split into sign, exponent and mantissa
+
+	if(numeric_limits<FloatType::Store>::is_iec559)
+		return bf.compose_iec559<FloatType::Store>();  // native type is IEC 559, so its bit pattern can be built directly
+	else
+	{
+		/* Put the float together with arithmetic since we don't know its
+		internal layout */
+		FloatType::Store f = 0;
+		if(bf.infinity)
+		{
+			if(numeric_limits<FloatType::Store>::has_infinity)
+				f = numeric_limits<FloatType::Store>::infinity();
+			else
+				f = numeric_limits<FloatType::Store>::max();  // largest finite value stands in for infinity
+		}
+		else
+		{
+			for(unsigned i=0; i<64; ++i)  // feed in mantissa bits from least to most significant
+			{
+				f /= 2;
+				if(bf.mantissa&1)
+					f += 1;
+				bf.mantissa >>= 1;
+			}
+			for(int i=0; i<bf.exponent; ++i)  // positive exponent: scale up by repeated doubling
+				f *= 2;
+			for(int i=0; i>bf.exponent; --i)  // negative exponent: scale down by repeated halving
+				f /= 2;
+		}
+		if(bf.sign)
+			f = -f;
+		return f;
+	}
 }
 
 BoolType::Store BinaryParser::parse_bool()
diff --git a/source/binaryparser.h b/source/binaryparser.h
index b10d7b8..b4fec6b 100644
--- a/source/binaryparser.h
+++ b/source/binaryparser.h
@@ -21,6 +21,7 @@ private:
 	Dictionary dict;
 	StringMap strings;
 	bool first;
+	unsigned float_precision;
 
 public:
 	BinaryParser(Input &i, const std::string &s);
diff --git a/source/binarywriter.cpp b/source/binarywriter.cpp
index 9c9c893..ffdc040 100644
--- a/source/binarywriter.cpp
+++ b/source/binarywriter.cpp
@@ -1,5 +1,7 @@
+#include <limits>
 #include <msp/core/maputils.h>
 #include "binarywriter.h"
+#include "binfloat.h"
 #include "statement.h"
 
 using namespace std;
@@ -10,10 +12,23 @@ namespace DataFile {
 BinaryWriter::BinaryWriter(IO::Base &o):
 	WriterMode(o),
 	next_kwd_id(1),
-	next_str_id(1)
+	next_str_id(1),
+	float_precision(32)  // default width in bits, the same initial value BinaryParser uses
 {
 	dict[DictEntry("__kwd", "iss")] = -1;
 	dict[DictEntry("__str", "is")] = -2;
+	dict[DictEntry("__flt", "i")] = -3;  // id -3 is the id BinaryParser registers for __flt
+}
+
+void BinaryWriter::set_float_precision(unsigned fp)  // fp: width of encoded floats, in bits
+{
+	if(fp<16 || fp>64 || fp%8)  // only whole-byte widths from 16 to 64 bits are accepted
+		throw invalid_argument("BinaryWriter::set_float_precision");
+	float_precision = fp;
+	Statement fst;  // the new width is announced in the output as a __flt statement
+	fst.keyword = "__flt";
+	fst.args.push_back(float_precision);
+	write_(fst);
 }
 
 void BinaryWriter::write(const Statement &st)
@@ -116,20 +131,44 @@ void BinaryWriter::write_string(const StringType::Store &s)
 
 void BinaryWriter::write_float(FloatType::Store f)
 {
-	union
+	BinFloat bf;  // sign/exponent/mantissa decomposition of f
+
+	if(numeric_limits<FloatType::Store>::is_iec559)
+		bf = BinFloat::explode_iec559(f);  // native layout is IEC 559, decode its bit pattern directly
+	else
 	{
-		float v;
-		char d[sizeof(float)];
-	};
-
-	v = f;
-#if BYTE_ORDER == LITTLE_ENDIAN
-	for(unsigned i = sizeof(float); i--;)
-		out.put(d[i]);
-#else
-	for(unsigned i = 0; i<sizeof(float); ++i)
-		out.put(d[i]);
-#endif
+		/* The structure of the float is unknown, so we must use arithmetic to
+		reduce it to components. */
+		bf.sign = f<0;
+		bf.exponent = 0;
+		bf.mantissa = 0;
+
+		if(f<0)
+			f = -f;
+		// Zero also fails the f+f>f test, so it has to be excluded explicitly
+		bf.infinity = (f!=0 && !(f+f>f));  // always assigned; BinFloat has no constructor
+		if(f!=0 && !bf.infinity)
+		{
+			for(; f<1; f*=2)  // normalize f into [1, 2), tracking the power of two
+				--bf.exponent;
+			for(; f>=2; f/=2)
+				++bf.exponent;
+			for(unsigned i=0; i<64; ++i)  // peel off 64 bits; the integer bit ends up in bit 63
+			{
+				bf.mantissa <<= 1;
+				if(f>=1)
+				{
+					bf.mantissa |= 1;
+					f -= 1;
+				}
+				f *= 2;
+			}
+		}
+	}
+
+	UInt64 encoded = bf.compose(float_precision);  // re-encode at the configured width
+	for(unsigned i=float_precision/8; i--; )  // emit most significant byte first
+		out.put((encoded>>(i*8))&0xFF);
 }
 
 void BinaryWriter::write_symbol(const SymbolType::Store &s)
diff --git a/source/binarywriter.h b/source/binarywriter.h
index 1a11ae5..2f67bbd 100644
--- a/source/binarywriter.h
+++ b/source/binarywriter.h
@@ -22,10 +22,12 @@ private:
 	unsigned next_kwd_id;
 	StringMap strings;
 	unsigned next_str_id;
+	unsigned float_precision;
 
 public:
 	BinaryWriter(IO::Base &o);
 
+	virtual void set_float_precision(unsigned);
 	virtual void write(const Statement &st);
 private:
 	void write_(const Statement &st);
diff --git a/source/binfloat.cpp b/source/binfloat.cpp
new file mode 100644
index 0000000..fa4be73
--- /dev/null
+++ b/source/binfloat.cpp
@@ -0,0 +1,72 @@
+#include <cmath>
+#include "binfloat.h"
+
+using namespace std;
+
+
+namespace Msp {
+namespace DataFile {
+
+BinFloat BinFloat::explode(UInt64 value, const Bits &bits)  // value: encoded bit pattern; bits: its field widths
+{
+	UInt64 mantissa_mask = (UInt64(1)<<bits.mantissa)-1;
+	int exponent_mask = (1<<bits.exponent)-1;  // doubles as the all-ones exponent value tested below
+
+	BinFloat bf;
+	// Extract biased exponent and sign
+	bf.exponent = (value>>bits.mantissa)&exponent_mask;
+	bf.sign = value>>(bits.mantissa+bits.exponent);
+	bf.infinity = (bf.exponent==exponent_mask);  // any all-ones exponent counts, whatever the mantissa bits are
+
+	if(bf.exponent==0 || bf.infinity)
+		// Zeroes and infinities have zero mantissa
+		bf.mantissa = 0;  // a zero exponent field always yields zero here, stored mantissa bits are ignored
+	else
+	{
+		// Extract mantissa, add the implied one and align it to high bits
+		bf.mantissa = (value&mantissa_mask) | (UInt64(1)<<bits.mantissa);
+		bf.mantissa <<= 63-bits.mantissa;
+	}
+
+	// Unbias the exponent
+	bf.exponent -= exponent_mask>>1;  // bias is half the exponent range, rounded down
+
+	return bf;
+}
+
+UInt64 BinFloat::compose(const Bits &bits)  // bits: field widths of the encoding to produce
+{
+	UInt64 mantissa_mask = (UInt64(1)<<bits.mantissa)-1;
+	int exponent_mask = (1<<bits.exponent)-1;  // all-ones exponent, reserved for infinity
+
+	int biased_exponent = exponent+(exponent_mask>>1);
+	// Shift down and round the mantissa
+	UInt64 rounded_mantissa = ((mantissa>>(62-bits.mantissa))+1)>>1;  // keeps one extra bit, rounds half up
+	// If the integer part is greater than one, we need to use a higher exponent
+	if((rounded_mantissa>>bits.mantissa)>1)
+		++biased_exponent;
+
+	if(biased_exponent>=exponent_mask || infinity)
+		// Overflow, return infinity
+		return UInt64(sign<<bits.exponent | exponent_mask)<<bits.mantissa;
+	else if(biased_exponent<=0 || !mantissa)
+		// Underflow or zero, return zero but keep the sign bit so -0 round-trips
+		return UInt64(sign)<<(bits.mantissa+bits.exponent);
+	else
+	{
+		UInt64 value = rounded_mantissa&mantissa_mask;  // masking drops the implied integer bit
+		value |= UInt64(biased_exponent)<<bits.mantissa;
+		value |= UInt64(sign)<<(bits.mantissa+bits.exponent);
+		return value;
+	}
+}
+
+
+// exponent = log_2(bits)*3-7; log() is the natural logarithm and 4.3281 ~= 3/ln(2)
+BinFloat::Bits::Bits(unsigned bits):
+	exponent(log(bits)*4.3281-7),  // fraction is dropped on conversion: 16 -> 5, 32 -> 8, 64 -> 11
+	mantissa(bits-exponent-1)  // what remains after the exponent and one sign bit
+{ }
+
+} // namespace DataFile
+} // namespace Msp
diff --git a/source/binfloat.h b/source/binfloat.h
new file mode 100644
index 0000000..d16570f
--- /dev/null
+++ b/source/binfloat.h
@@ -0,0 +1,63 @@
+#ifndef MSP_DATAFILE_BINFLOAT_H_
+#define MSP_DATAFILE_BINFLOAT_H_
+
+#include "type.h"
+
+namespace Msp {
+namespace DataFile {
+
+/**
+Facilitates splitting floating-point numbers into parts and putting them back
+together.  Supports arbitrary sizes up to 64 bits.  The 16, 32 and 64 bit
+formats exactly match those defined by ISO/IEC 60559:2011.
+
+The exponent is stored in an unbiased form.  The mantissa is stored with the
+integer part included, aligned to the high bits of a 64-bit integer.
+*/
+struct BinFloat
+{
+	struct Bits  // widths, in bits, of the exponent and mantissa fields of an encoding
+	{
+		unsigned exponent;
+		unsigned mantissa;
+
+		Bits(unsigned);  // derives both widths from a total size in bits; implicit, so a plain size can be passed
+	};
+
+	template<typename T>
+	union Conversion  // reinterprets a T as an unsigned integer; presumably same-sized (see MatchingInt)
+	{
+		T f;
+		typename MatchingInt<T>::UnsignedType i;
+	};
+
+	bool sign;  // true for negative values
+	bool infinity;  // set by explode() when the exponent field is all ones
+	int exponent;  // unbiased
+	UInt64 mantissa;  // includes the integer bit, aligned to bit 63
+
+	static BinFloat explode(UInt64, const Bits &);  // decodes a raw bit pattern with the given field widths
+
+	template<typename T>
+	static BinFloat explode_iec559(T v)  // callers check numeric_limits<T>::is_iec559 before using this
+	{
+		Conversion<T> c;
+		c.f = v;
+		return explode(c.i, sizeof(T)*CHAR_BIT);  // NOTE(review): CHAR_BIT needs <climits>; presumably provided via type.h - confirm
+	}
+
+	UInt64 compose(const Bits &);  // encodes into a raw bit pattern with the given field widths
+
+	template<typename T>
+	T compose_iec559()  // callers check numeric_limits<T>::is_iec559 before using this
+	{
+		Conversion<T> c;
+		c.i = compose(sizeof(T)*CHAR_BIT);
+		return c.f;
+	}
+};
+
+} // namespace DataFile
+} // namespace Msp
+
+#endif
diff --git a/source/textwriter.cpp b/source/textwriter.cpp
index 0e5b674..65f4870 100644
--- a/source/textwriter.cpp
+++ b/source/textwriter.cpp
@@ -9,9 +9,15 @@ namespace Msp {
 namespace DataFile {
 
 TextWriter::TextWriter(IO::Base &o):
-	WriterMode(o)
+	WriterMode(o),
+	float_format("%#.7g")  // 7 digits by default, the same count set_float_precision derives for 32 bits
 { }
 
+void TextWriter::set_float_precision(unsigned fp)  // fp: requested precision in bits
+{
+	float_format = format("%%#.%dg", fp/4-1);  // fp/4-1 digits, e.g. 32 -> 7; NOTE(review): fp<4 wraps around in unsigned arithmetic
+}
+
 void TextWriter::write(const Statement &st)
 {
 	write_(st, 0);
@@ -32,7 +38,7 @@ void TextWriter::write_(const Statement &st, unsigned level)
 		else if(i->get_signature()==IntType::signature)
 			out.write(lexical_cast(i->get<IntType::Store>()));
 		else if(i->get_signature()==FloatType::signature)
-			out.write(format("%15g", (i->get<FloatType::Store>())));
+			out.write(format(float_format, i->get<FloatType::Store>()));
 		else if(i->get_signature()==SymbolType::signature)
 		{
 			string name = i->get<SymbolType::Store>().name;
diff --git a/source/textwriter.h b/source/textwriter.h
index ad127de..a0ed1b7 100644
--- a/source/textwriter.h
+++ b/source/textwriter.h
@@ -8,9 +8,13 @@ namespace DataFile {
 
 class TextWriter: public WriterMode
 {
+private:
+	std::string float_format;
+
 public:
 	TextWriter(IO::Base &o);
 
+	virtual void set_float_precision(unsigned);
 	virtual void write(const Statement &st);
 private:
 	void write_(const Statement &st, unsigned);
diff --git a/source/writer.cpp b/source/writer.cpp
index c6a1e12..6091409 100644
--- a/source/writer.cpp
+++ b/source/writer.cpp
@@ -46,5 +46,10 @@ void Writer::set_binary(bool b)
 		mode = new TextWriter(out);
 }
 
+void Writer::set_float_precision(unsigned fp)  // fp: precision in bits
+{
+	mode->set_float_precision(fp);  // forwarded to the current WriterMode object
+}
+
 } // namespace DataFile
 } // namespace Msp
diff --git a/source/writer.h b/source/writer.h
index 159bf95..c8d75e3 100644
--- a/source/writer.h
+++ b/source/writer.h
@@ -38,6 +38,11 @@ public:
 	@param  b  true for binary mode, false for text
 	*/
 	void set_binary(bool b);
+
+	/** Sets the precision of floating point numbers in bits.  Depending on the
+	mode not all values may be valid, but any value between 16 and 64 that is
+	divisible by 8 is guaranteed to work. */
+	void set_float_precision(unsigned);
 };
 
 } // namespace DataFile
diff --git a/source/writermode.h b/source/writermode.h
index 3778db6..21df83f 100644
--- a/source/writermode.h
+++ b/source/writermode.h
@@ -17,6 +17,7 @@ protected:
 public:
 	virtual ~WriterMode() { }
 
+	virtual void set_float_precision(unsigned) = 0;
 	virtual void write(const Statement &st) = 0;
 };
 
diff --git a/tool/tool.cpp b/tool/tool.cpp
index 4e66087..5f9febe 100644
--- a/tool/tool.cpp
+++ b/tool/tool.cpp
@@ -15,11 +15,13 @@ DataTool::DataTool(int argc, char **argv):
 	in_fn("-"),
 	out_fn("-"),
 	binary(false),
-	compile(false)
+	compile(false),
+	float_size(0)
 {
 	GetOpt getopt;
 	getopt.add_option('b', "binary", binary, GetOpt::NO_ARG);
 	getopt.add_option('c', "compile", compile, GetOpt::NO_ARG);
+	getopt.add_option('f', "float-size", float_size, GetOpt::REQUIRED_ARG);
 	getopt.add_option('o', "output", out_fn, GetOpt::REQUIRED_ARG);
 	getopt(argc, argv);
 
@@ -49,6 +51,8 @@ int DataTool::main()
 		DataFile::Writer writer(out_buf);
 		if(binary)
 			writer.set_binary(true);
+		if(float_size)
+			writer.set_float_precision(float_size);
 
 		if(compile)
 		{
diff --git a/tool/tool.h b/tool/tool.h
index a43fbfb..43c82e1 100644
--- a/tool/tool.h
+++ b/tool/tool.h
@@ -11,6 +11,7 @@ private:
 	std::string out_fn;
 	bool binary;
 	bool compile;
+	unsigned float_size;
 
 public:
 	DataTool(int argc, char **argv);