Sunday, May 02, 2010

Protocol Buffers v/s HTTP

I was discussing serialization costs with my immediate superior, Ramki, when one thing led to another and I ended up wanting to compare the serialization and deserialization costs of protocol buffers (a binary format) to those of HTTP (a text-based protocol) in Python.

After performing some tests (in python), I observed that protobuf was taking almost 4 times the amount of time to deserialize data as compared to the simplistic HTTP based header parsing. Of course, these 2 are meant for different purposes and Ramki mentioned that a fixed format protocol would save data on the wire since the attribute names (header names in HTTP) need not be sent on the wire; just the values (header values in HTTP) are sufficient.

Also, a binary protocol should be much faster as far as serialization and deserialization are concerned, but we found that either Python's struct pack and unpack are a bit slow, or Python is blazingly fast at doing string operations.

Here is a representative output from one such run of the program below:

ramki serialization time: 0.125000 seconds
ramki deserialization time: 0.156000 seconds
protobuf serialization time: 0.453000 seconds
protobuf deserialization time: 0.453000 seconds
http serialization time: 0.047000 seconds
http deserialization time: 0.125000 seconds

The code:

import sys
import message_pb2
import time
import struct

def multi_serialize(o, n):
    """Serialize ``o`` ``n`` times into one length-prefixed byte stream.

    Each record is the decimal length of the payload, a CRLF, and then the
    payload returned by ``o.SerializeToString()``.

    The stream is assembled as bytes: on Python 2 this is the same ``str``
    the text join used to produce, and on Python 3 it avoids mixing text
    with the binary payload.
    """
    fragments = []
    for _ in range(n):
        data = o.SerializeToString()
        # The length prefix is plain ASCII digits, so encoding cannot fail.
        fragments.append(("%d\r\n" % len(data)).encode("ascii"))
        fragments.append(data)
    return b"".join(fragments)

def multi_parse(input, n):
    """Parse ``n`` length-prefixed protobuf records from ``input``.

    ``input`` is a byte stream of records, each a decimal length, a CRLF
    and that many payload bytes. Every payload is parsed into a new
    ``message_pb2.Header``.

    Returns the list of parsed objects, or ``None`` (after printing a
    diagnostic) when the stream ends before ``n`` complete records were
    read. Raises ``ValueError`` if a length prefix is not a decimal integer.
    """
    start = 0
    objects = []
    for _ in range(n):
        rnPos = input.find(b"\r\n", start)
        if rnPos == -1:
            print("Premature end of input. Terminating...")
            return None
        lenInt = int(input[start:rnPos])
        start = rnPos + 2
        # Read lenInt bytes off the stream.
        data = input[start:start + lenInt]
        # A short (or negative-length) slice means the record was cut off;
        # report it instead of handing a partial payload to the parser.
        if len(data) != lenInt:
            print("Premature end of input. Terminating...")
            return None
        start += lenInt
        obj = message_pb2.Header()
        obj.ParseFromString(data)
        objects.append(obj)
    return objects


def http_header_create(request, headers):
    """Build the text of an HTTP/1.1 GET request head.

    ``request`` is the resource placed on the request line and ``headers``
    maps header names to string values. Lines are CRLF-separated and the
    head is terminated by a blank line.
    """
    lines = ["GET %s HTTP/1.1" % request]
    lines.extend(name + ": " + value for name, value in headers.items())
    return "\r\n".join(lines) + "\r\n\r\n"

def http_header_parse(input):
    """Parse one HTTP request head (without its terminating blank line).

    Returns ``(request_line, headers)``, where ``request_line`` is the
    whitespace-split first line as a tuple and ``headers`` maps header
    names to values.

    Each header line is split on the first ``": "`` only, so a value that
    itself contains ``": "`` is kept intact rather than dropped. Lines
    with no separator are ignored.
    """
    parts = input.split("\r\n")
    line1 = tuple(parts[0].split())
    headers = {}
    for line in parts[1:]:
        name, separator, value = line.partition(": ")
        if separator:
            headers[name] = value
    return (line1, headers)

def http_multi_serialize(request, headers, n):
    """Return ``n`` copies of the request head, each built afresh, concatenated."""
    return "".join([http_header_create(request, headers) for _ in range(n)])

def http_multi_parse(input, n):
    """Parse ``n`` consecutive HTTP request heads from ``input``.

    Heads are delimited by a blank line (``"\\r\\n\\r\\n"``). Each one is
    handed to ``http_header_parse``.

    Returns the list of parsed heads, or ``None`` (after printing a
    diagnostic) if fewer than ``n`` delimiters are found.
    """
    start = 0
    objects = []
    for _ in range(n):
        delimPos = input.find("\r\n\r\n", start)
        if delimPos == -1:
            print("Premature end of input. Terminating...")
            return None
        objects.append(http_header_parse(input[start:delimPos]))
        # Skip past the 4-character blank-line delimiter.
        start = delimPos + 4
    return objects

def ramki_serialize(obj):
    """Pack the attribute values of ``obj`` into a length-prefixed packet.

    Layout: an unsigned short holding the total body length, then for each
    value in ``obj.__dict__`` an unsigned short length followed by the raw
    value. Attribute names are not written. Shorts use the native byte
    order (struct format ``"H"``).

    Values must be byte strings. The packet is joined as bytes, which is
    the same ``str`` as before on Python 2 and also works on Python 3.

    Raises ``struct.error`` if a length does not fit in an unsigned short.
    """
    pack_short = struct.Struct("H").pack
    totalLength = 0
    body = []
    for value in obj.__dict__.values():
        # 2 bytes for the per-attribute length prefix plus the value itself.
        totalLength += 2 + len(value)
        body.append(pack_short(len(value)) + value)
    return b"".join([pack_short(totalLength)] + body)

class RamkiDummy(object):
    """Empty attribute container; ramki_deserialize sets attr0, attr1, ... on it."""

# Pre-compiled codec for the unsigned-short length prefixes. "H" has no
# byte-order prefix, so values are read in the machine's native byte order.
shortStruct = struct.Struct("H")

def ramki_deserialize(input):
    """Decode one packet into a ``RamkiDummy``.

    The packet starts with an unsigned short giving the body length. The
    body is a sequence of (unsigned short length, value) pairs. Attribute
    names are not on the wire, so the values are stored as ``attr0``,
    ``attr1``, ... in the order they are read.
    """
    result = RamkiDummy()
    (packetLength,) = shortStruct.unpack(input[0:2])
    end = packetLength + 2  # body length plus the 2-byte packet header
    pos = 2
    index = 0
    while pos < end:
        (attrLength,) = shortStruct.unpack(input[pos:pos + 2])
        pos += 2
        # The next attrLength bytes are the attribute's value.
        setattr(result, "attr%d" % index, input[pos:pos + attrLength])
        pos += attrLength
        index += 1

    return result

def ramki_multi_serialize(obj, n):
    """Serialize ``obj`` ``n`` times with ``ramki_serialize`` and concatenate.

    The packets are binary, so they are joined as bytes. On Python 2 this
    is the same ``str`` result as before.
    """
    return b"".join([ramki_serialize(obj) for _ in range(n)])

def ramki_multi_deserialize(input, n):
    """Decode ``n`` back-to-back packets from ``input``.

    Each packet's leading unsigned short gives its body length, which is
    used to slice the packet out before passing it to ``ramki_deserialize``.
    Returns the decoded objects in stream order.
    """
    decoded = []
    offset = 0
    for _ in range(n):
        (bodyLength,) = shortStruct.unpack(input[offset:offset + 2])
        end = offset + bodyLength + 2  # include the 2-byte packet header
        decoded.append(ramki_deserialize(input[offset:end]))
        offset = end
    return decoded

def _timed(label, func, *args):
    """Call ``func(*args)``, print the elapsed time under ``label``, return the result."""
    # Imported locally so this helper does not rely on the file's import
    # list. default_timer is the clock the timeit module uses for
    # benchmarking on the running platform.
    from timeit import default_timer

    start = default_timer()
    result = func(*args)
    print("%s time: %f seconds" % (label, default_timer() - start))
    return result


def main():
    """Time serialization and deserialization for the three wire formats.

    Runs 10000 round trips each for the hand-rolled "ramki" binary format,
    protocol buffers and HTTP-style headers, printing one timing line per
    phase. Returns 0, which the caller uses as the process exit status.
    """
    iterations = 10000

    class Dummy(object):
        pass

    # Byte-string values: identical to plain literals on Python 2, and the
    # form the struct-based packer needs on Python 3.
    d = Dummy()
    d.request = b"GET"
    d.resource = b"/user/ramki/getVcard/"
    d.version = b"1.1"
    d.destination = b"localhost:8080"
    d.custom1 = b"434552"
    d.custom2 = b"no"

    packed = _timed("ramki serialization", ramki_multi_serialize, d, iterations)
    _timed("ramki deserialization", ramki_multi_deserialize, packed, iterations)

    h = message_pb2.Header()
    h.request = "GET"
    h.resource = "/user/ramki/getVcard/"
    h.version = "1.1"
    h.destination = "localhost:8080"
    h.custom1 = "434552"
    h.custom2 = "no"

    stream = _timed("protobuf serialization", multi_serialize, h, iterations)
    _timed("protobuf deserialization", multi_parse, stream, iterations)

    hh = {
        "Host": "localhost",
        "X-MessageID": "33",
        "X-ACKMessage": "100",
    }

    stream = _timed("http serialization", http_multi_serialize,
                    "/user/ramki/getVcard/", hh, iterations)
    _timed("http deserialization", http_multi_parse, stream, iterations)

    return 0

# Run the benchmark only when executed as a script, so that importing this
# module neither starts the benchmark nor exits the interpreter.
if __name__ == "__main__":
    sys.exit(main())


This page claims that protobuf deserialization time is 3478ns whereas I am seeing a time of 4530ns, which is expected since we are running on different hardware.

From here, it seems as if protobuf is good at packing ints but not strings.

In fact, none of the deserialization times even come close to the 1250ns that I see for http parsing. This is mainly because those methods do type conversion which HTTP is not doing.
If that is introduced into the mix, I guess those costs will add up too. However, the application that I want it for doesn't really need it, and there will be many applications that don't.

In the link above, for many of the methods, serialization takes more time than deserialization, which is slightly curious.