htdig3-dev Patch for htcommon/DocumentRef.cc: Don't store stuff in the database needlessly as int


Hans-Peter Nilsson (hans-peter.nilsson@axis.com)
Sat, 16 Jan 1999 04:15:08 +0100


While looking into how to cut down the database size, I tripped
over an opportunity for a naive optimization: all numeric values
and lengths are currently stored as "int"s, even when a smaller
type would do.

This patch does two good things IMHO:

* Does not save the length or numeric value of stuff being put in
  the database as "int" when a smaller type suffices; such a value
  very seldom needs more than an unsigned char (unsigned short is
  also handled). The size is encoded in "unused" bits of the
  type-byte preceding the value.
  This should not be ABI-specific; if char == short or short ==
  int, the respective optimization will essentially be a no-op.

* As a side-effect, cleans up the "get..." and "add..." macros
  (still macros for this release, but they should be inline
  functions) so they are legible. At least here their legibility
  was a problem when using the canonical tab-size of 8 spaces.

Hmm, I don't know which item is the best :-)

It seems this should save a lot of space for the stuff we put in
the database, yet on a very simple test the database comes out
only about 2% smaller than before (all without zlib
compression). I'll test more.

Note that this patch does not break "compatibility" with older
databases: you will be able to merge with them as before, but
the merged databases will of course not be readable by older
versions of the code.

You will get two (?) more (identical) compilation warnings about
"converting from int to enum ReferenceState", which is just a
nit, but I'll try to remove them altogether (not that these were
the only warnings, but the fewer spurious warnings, the better).

It would be nice if this could get into the release; I think I
was still on time with this one.

Fri Jan 15 07:23:30 1999 Hans-Peter Nilsson <hp@axis.se>

        * htcommon/DocumentRef.cc DocumentRef::Serialize(String &s):
        Save space when lengths can fit in an unsigned char or unsigned
        short.
        * DocumentRef::Deserialize(String &stream): Handle expansion.

Index: htcommon/DocumentRef.cc
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htcommon/DocumentRef.cc,v
retrieving revision 1.16
diff -p -c -r1.16 DocumentRef.cc
*** DocumentRef.cc 1999/01/14 03:06:26 1.16
--- DocumentRef.cc 1999/01/16 02:06:03
*************** enum
*** 144,149 ****
--- 144,152 ----
      DOC_SIG // 19
  };
  
+ // Must be powers of two never reached by the DOC_... enums.
+ #define CHARSIZE_MARKER_BIT 64
+ #define SHORTSIZE_MARKER_BIT 128
  
  //*****************************************************************************
  // void DocumentRef::Serialize(String &s)
*************** void DocumentRef::Serialize(String &s)
*** 161,191 ****
  // value for this class, it it NOT serialized. This means that
  // storage will be saved...
  //
! #define addnum(id, out, var) if (var != 0) \
! { \
! out << (char) id; \
! out.append((char *) &var, sizeof(var)); \
! }
! #define addstring(id, out, str) if (str.length()) \
! { \
! length = str.length(); \
! out << (char) id; \
! out.append((char *) &length, sizeof(length)); \
! out.append(str); \
! }
! #define addlist(id, out, list) if (list.Count()) \
! { \
! length = list.Count(); \
! out << (char) id; \
! out.append((char *) &length, sizeof(length)); \
! list.Start_Get(); \
! while ((str = (String *) list.Get_Next())) \
! { \
! length = str->length(); \
! out.append((char*) &length, sizeof(length));\
! out.append(*str); \
! } \
! }
  
      addnum(DOC_ID, s, docID);
      addnum(DOC_TIME, s, docTime);
--- 164,272 ----
  // value for this class, it it NOT serialized. This means that
  // storage will be saved...
  //
! #define addnum(id, out, var) \
! if (var != 0) \
! { \
! if (var <= (unsigned char) ~1) \
! { \
! unsigned char _tmp = var; \
! out << (char) (id | CHARSIZE_MARKER_BIT); \
! out.append((char *) &_tmp, sizeof(_tmp)); \
! } \
! else if (var <= (unsigned short int) ~1) \
! { \
! unsigned short int _tmp = var; \
! out << (char) (id | SHORTSIZE_MARKER_BIT); \
! out.append((char *) &_tmp, sizeof(_tmp)); \
! } \
! else \
! { \
! out << (char) id; \
! out.append((char *) &var, sizeof(var)); \
! } \
! }
!
! #define addstring(id, out, str) \
! if (str.length()) \
! { \
! length = str.length(); \
! if (length <= (unsigned char) ~1) \
! { \
! unsigned char _tmp = length; \
! out << (char) (id | CHARSIZE_MARKER_BIT); \
! out.append((char *) &_tmp, sizeof(_tmp)); \
! } \
! else if (length <= (unsigned short int) ~1) \
! { \
! unsigned short int _tmp = length; \
! out << (char) (id | SHORTSIZE_MARKER_BIT); \
! out.append((char *) &_tmp, sizeof(_tmp)); \
! } \
! else \
! { \
! out << (char) id; \
! out.append((char *) &length, sizeof(length)); \
! } \
! out.append(str); \
! }
!
! // To keep compatibility with old databases, don't bother
! // with long lists at all. Bloat the size for long strings with
! // one char to just keep a ~1 marker since we don't know the
! // endianness; we don't know where to put a endian-safe
! // size-marker, and we probably rather want the full char to
! // keep the length. Only strings shorter than (unsigned char) ~1
! // will be "optimized"; trying to optimize strings that fit in
! // (unsigned short) does not seem to give anything substantial.
! #define addlist(id, out, list) \
! if (list.Count()) \
! { \
! length = list.Count(); \
! if (length <= (unsigned short int) ~1) \
! { \
! if (length <= (unsigned char) ~1) \
! { \
! unsigned char _tmp = length; \
! out << (char) (id | CHARSIZE_MARKER_BIT); \
! out.append((char *) &_tmp, sizeof(_tmp)); \
! } \
! else \
! { \
! unsigned short int _tmp = length; \
! out << (char) (id | SHORTSIZE_MARKER_BIT); \
! out.append((char *) &_tmp, sizeof(_tmp)); \
! } \
! list.Start_Get(); \
! while ((str = (String *) list.Get_Next())) \
! { \
! length = str->length(); \
! if (length < (unsigned char) ~1) \
! { \
! unsigned char _tmp = length; \
! out.append((char*) &_tmp, sizeof(_tmp)); \
! } \
! else \
! { \
! unsigned char _tmp = ~1; \
! out.append((char*) &_tmp, sizeof(_tmp)); \
! out.append((char*) &length, sizeof(length)); \
! } \
! out.append(*str); \
! } \
! } \
! else \
! { \
! out << (char) id; \
! out.append((char *) &length, sizeof(length)); \
! list.Start_Get(); \
! while ((str = (String *) list.Get_Next())) \
! { \
! length = str->length(); \
! out.append((char*) &length, sizeof(length)); \
! out.append(*str); \
! } \
! } \
! }
  
      addnum(DOC_ID, s, docID);
      addnum(DOC_TIME, s, docTime);
*************** void DocumentRef::Deserialize(String &st
*** 304,386 ****
      String *str;
  
  
! #define getnum(in, var) memcpy((char *) &var, in, sizeof(var)); \
! in += sizeof(var)
! #define getstring(in, str) getnum(in, length); \
! str = 0; \
! str.append(in, length); \
! in += length
! #define getlist(in, list) getnum(in, count); \
! for (i = 0; i < count; i++) \
! { \
! getnum(in, length); \
! str = new String; \
! str->append(in, length); \
! list.Add(str); \
! in += length; \
! }
  
      while (s < end)
      {
! x = *s++;
! switch (x)
          {
          case DOC_ID:
! getnum(s, docID);
              break;
          case DOC_TIME:
! getnum(s, docTime);
              break;
          case DOC_ACCESSED:
! getnum(s, docAccessed);
              break;
          case DOC_STATE:
! getnum(s, docState);
              break;
          case DOC_SIZE:
! getnum(s, docSize);
              break;
          case DOC_LINKS:
! getnum(s, docLinks);
              break;
          case DOC_IMAGESIZE:
! getnum(s, docImageSize);
              break;
          case DOC_HOPCOUNT:
! getnum(s, docHopCount);
              break;
          case DOC_BACKLINKS:
! getnum(s, docBackLinks);
              break;
          case DOC_SIG:
! getnum(s, docSig);
              break;
          case DOC_URL:
! getstring(s, docURL);
              break;
          case DOC_HEAD:
! getstring(s, docHead);
              break;
          case DOC_METADSC:
! getstring(s, docMetaDsc);
              break;
          case DOC_TITLE:
! getstring(s, docTitle);
              break;
          case DOC_DESCRIPTIONS:
! getlist(s, descriptions);
              break;
          case DOC_ANCHORS:
! getlist(s, docAnchors);
              break;
          case DOC_EMAIL:
! getstring(s, docEmail);
              break;
          case DOC_NOTIFICATION:
! getstring(s, docNotification);
              break;
          case DOC_SUBJECT:
! getstring(s, docSubject);
              break;
          case DOC_STRING:
            // This is just a debugging string. Ignore it.
--- 385,506 ----
      String *str;
  
  
! #define getnum(type, in, var) \
! if (type & CHARSIZE_MARKER_BIT) \
! { \
! var = (int) *(unsigned char *) in; \
! in += sizeof(unsigned char); \
! } \
! else if (type & SHORTSIZE_MARKER_BIT) \
! { \
! var = (int) *(unsigned short int *) in; \
! in += sizeof(unsigned short int); \
! } \
! else \
! { \
! memcpy((char *) &var, in, sizeof(var)); \
! in += sizeof(var); \
! }
!
! #define getstring(type, in, str) \
! getnum(type, in, length); \
! str = 0; \
! str.append(in, length); \
! in += length
!
! #define getlist(type, in, list) \
! getnum(type, in, count); \
! if (type & (CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT)) \
! { \
! for (i = 0; i < count; i++) \
! { \
! unsigned char _tmp = *(unsigned char *) in; \
! in += sizeof(_tmp); \
! if (_tmp < (unsigned char) ~1) \
! length = _tmp; \
! else \
! getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \
! length); \
! str = new String; \
! str->append(in, length); \
! list.Add(str); \
! in += length; \
! } \
! } \
! else \
! { \
! for (i = 0; i < count; i++) \
! { \
! getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \
! length); \
! str = new String; \
! str->append(in, length); \
! list.Add(str); \
! in += length; \
! } \
! }
  
      while (s < end)
      {
! x = (unsigned char) *s++;
! switch (x & ~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT))
          {
          case DOC_ID:
! getnum(x, s, docID);
              break;
          case DOC_TIME:
! getnum(x, s, docTime);
              break;
          case DOC_ACCESSED:
! getnum(x, s, docAccessed);
              break;
          case DOC_STATE:
! getnum(x, s, docState);
              break;
          case DOC_SIZE:
! getnum(x, s, docSize);
              break;
          case DOC_LINKS:
! getnum(x, s, docLinks);
              break;
          case DOC_IMAGESIZE:
! getnum(x, s, docImageSize);
              break;
          case DOC_HOPCOUNT:
! getnum(x, s, docHopCount);
              break;
          case DOC_BACKLINKS:
! getnum(x, s, docBackLinks);
              break;
          case DOC_SIG:
! getnum(x, s, docSig);
              break;
          case DOC_URL:
! getstring(x, s, docURL);
              break;
          case DOC_HEAD:
! getstring(x, s, docHead);
              break;
          case DOC_METADSC:
! getstring(x, s, docMetaDsc);
              break;
          case DOC_TITLE:
! getstring(x, s, docTitle);
              break;
          case DOC_DESCRIPTIONS:
! getlist(x, s, descriptions);
              break;
          case DOC_ANCHORS:
! getlist(x, s, docAnchors);
              break;
          case DOC_EMAIL:
! getstring(x, s, docEmail);
              break;
          case DOC_NOTIFICATION:
! getstring(x, s, docNotification);
              break;
          case DOC_SUBJECT:
! getstring(x, s, docSubject);
              break;
          case DOC_STRING:
            // This is just a debugging string. Ignore it.

brgds, H-P



This archive was generated by hypermail 2.0b3 on Wed Jan 20 1999 - 08:37:46 PST