htdig: PATCH: Merge Database Code


Geoff Hutchison (ghutchis@wso.williams.edu)
Thu, 7 Jan 1999 10:15:34 -0500 (EST)


OK, I'm finally finished with the code to merge two databases. This has
received *VERY* limited testing. Use at your own risk, I'm sure it's
crawling with bugs--I'm posting it in hopes that people will try it out
and send me feedback and bug reports.

Once again, don't try it on any databases you care about.

So here's how you use it... After applying the patch and recompiling,
htmerge still works as normal. But if you want to merge two database sets
as specified in separate config files, do the following:

htmerge (options) -m /path/to/source.conf (-c /path/to/destination.conf)

The -c is, of course, optional. If you don't specify it, htmerge will use
your default config file. When it runs, it will first merge in the databases
specified by the config file given with -m and then do its normal duties (e.g.
making sure the destination DBs have the db.docs.index and db.words.db)...

Once this is given a green light, I'll post a script that makes merging
more than two databases easier.

As I said, I welcome feedback and bug reports. I expect I'll get lots of
the latter. :-)

-Geoff Hutchison
Williams Students Online
http://wso.williams.edu/

diff -c3pN htdig3/htmerge/Makefile.in htdig3.dev/htmerge/Makefile.in
*** htdig3/htmerge/Makefile.in Wed Jan 6 12:38:51 1999
--- htdig3.dev/htmerge/Makefile.in Thu Jan 7 00:20:44 1999
*************** INSTALL= @INSTALL@
*** 6,12 ****
  top_builddir= ..
  include $(top_builddir)/Makefile.config
  
! OBJS= docs.o htmerge.o words.o
  
  TARGET= htmerge
  LOCAL_DEFINES= -DSORT_PROG=\"@SORT@\"
--- 6,12 ----
  top_builddir= ..
  include $(top_builddir)/Makefile.config
  
! OBJS= db.o docs.o htmerge.o words.o
  
  TARGET= htmerge
  LOCAL_DEFINES= -DSORT_PROG=\"@SORT@\"
diff -c3pN htdig3/htmerge/db.cc htdig3.dev/htmerge/db.cc
*** htdig3/htmerge/db.cc Wed Dec 31 19:00:00 1969
--- htdig3.dev/htmerge/db.cc Thu Jan 7 10:06:02 1999
***************
*** 0 ****
--- 1,289 ----
+ //
+ // db.cc
+ //
+ // Implementation of merging databases. Uses two config files to specify which
+ // sets of databases to merge. Only adds the data in; it assumes mergeWords and
+ // convertDocs are run afterwards to ensure database integrity.
+ //
+ // $Log: db.cc,v $
+ //
+ //
+
+ #include "htmerge.h"
+
+ //*****************************************************************************
+ // void mergeDB()
+ // Merges the doc DB and word list named by merge_config into those of config.
+ void
+ mergeDB()
+ {
+ DocumentDB merge_db, db;
+ List *urls;
+ Dictionary merge_dup_ids, db_dup_ids; // Lists of DocIds to ignore
+ char *doc_db, *merge_doc_db;
+ int docIDOffset;
+
+ doc_db = config["doc_db"];
+ if (db.Open(doc_db) < 0)
+ {
+ reportError(form("Unable to open/create document database '%s'",
+ doc_db));
+ }
+
+ merge_doc_db = merge_config["doc_db"];
+ if (merge_db.Open(merge_doc_db) < 0)
+ {
+ reportError(form("Unable to open document database '%s'",
+ merge_doc_db));
+ }
+
+ // Start the merging by going through all the URLs that are in
+ // the database to be merged
+
+ urls = merge_db.URLs();
+ // Every document added from merge_db gets this offset added to its DocID so
+ // that it has a unique ID in the new database
+ docIDOffset = db.NextDocID();
+
+ urls->Start_Get();
+ String *url;
+ // Each URL is either new to db, or a duplicate resolved by document time
+ while ((url = (String *) urls->Get_Next()))
+ {
+ DocumentRef *ref = merge_db[url->get()];
+ if (!ref)
+ continue;  // checked before old_ref is fetched so old_ref cannot leak
+ DocumentRef *old_ref = db[url->get()];
+
+ if (old_ref)
+ {
+ // Oh well, we knew this would happen. Both databases have this URL,
+ // so we'll only keep the copy with the most recent date.
+
+ if ( old_ref->DocTime() > ref->DocTime() )
+ {
+ // Cool, the ref we're merging is too old, just ignore it
+ char str[20];
+ sprintf(str, "%d", ref->DocID());
+ merge_dup_ids.Add(str, 0);
+
+ if (verbose > 1)
+ {
+ cout << "htmerge: Duplicate, URL: " << url->get() << " ignoring merging copy \n";
+ cout.flush();
+ }
+ }
+ else
+ {
+ // The ref we're merging is at least as new, delete the old one and add
+ char str[20];
+ sprintf(str, "%d", old_ref->DocID());
+ db_dup_ids.Add(str, 0);
+ db.Delete(url->get());
+ ref->DocID(ref->DocID() + docIDOffset);
+ db.Add(*ref);
+ if (verbose > 1)
+ {
+ cout << "htmerge: Duplicate, URL: ";
+ cout << url->get() << " ignoring destination copy \n";
+ cout.flush();
+ }
+ }
+ }
+ else
+ {
+ // it's a new URL, just add it
+ ref->DocID(ref->DocID() + docIDOffset);
+ db.Add(*ref);
+ if (verbose > 1)
+ {
+ cout << "htmerge: Merged URL: " << url->get() << " \n";
+ cout.flush();
+ }
+ }
+ delete ref;
+ delete old_ref;
+ }
+ delete urls;
+ merge_db.Close();
+ db.Close();
+
+ //
+ // Now we go through the original wordlist and remove deleted docIDs
+ //
+
+ char *wordtmp = config["word_list"];
+ FILE *wordlist = fopen(form("%s.new", wordtmp), "w");
+ FILE *dbwords = fopen(wordtmp, "r");
+ char buffer[1000];
+ String word;
+ char *sid;
+ char *name, *value, *pair;
+ WordRecord wr;
+
+ // Check for file access errors (NOTE(review): assumes reportError() exits)
+ if (!wordlist)
+ {
+ reportError(form("Unable to create temporary word file '%s.new'",
+ wordtmp));
+ }
+ if (!dbwords)
+ {
+ reportError(form("Unable to open word list file '%s'", wordtmp));
+ }
+
+ // Read it in a line at a time...
+ while (fgets(buffer, sizeof(buffer), dbwords))
+ {
+ // Split the line up into the word, count, location, and
+ // document id, just like in words.cc(mergeWords).
+ word = good_strtok(buffer, "\t");
+ pair = good_strtok("\t");
+ wr.Clear(); // Reset count to 1, anchor to 0, and all that
+ sid = "-";  // placeholder key used when the line has no i: field
+ while (pair && *pair)
+ {
+ name = strtok(pair, ":");
+ value = strtok(0, "\n");
+ if (name && *name && value && *value)
+ {
+ switch (*name)
+ {
+ case 'c':
+ wr.count = atoi(value);
+ break;
+ case 'l':
+ wr.location = atoi(value);
+ break;
+ case 'i':
+ sid = value;
+ wr.id = atoi(value);
+ break;
+ case 'w':
+ wr.weight = atoi(value);
+ break;
+ case 'a':
+ wr.anchor = atoi(value);
+ break;
+ }
+ }
+ pair = good_strtok("\t");
+ }
+
+ // OK, now we have to check if this word was from a doc we discarded.
+ if (db_dup_ids.Exists(sid))
+ {
+ if (verbose > 1)
+ {
+ cout << "htmerge: Discarding duplicate " << word << " in doc #"
+ << sid << " \n";
+ cout.flush();
+ }
+ continue;
+ }
+
+ // Record the word in the new file
+ fprintf(wordlist, "%s", word.get());
+ if (wr.count != 1)
+ {
+ fprintf(wordlist, "\tc:%d", wr.count);
+ }
+ fprintf(wordlist, "\tl:%d\ti:%d\tw:%d",
+ wr.location,
+ wr.id,
+ wr.weight);
+ if (wr.anchor != 0)
+ {
+ fprintf(wordlist, "\ta:%d", wr.anchor);
+ }
+ putc('\n', wordlist);
+ }
+ fclose(dbwords);
+ db_dup_ids.Destroy(); // Save some memory
+
+ // Now wordlist is at the end of its current stream, so we're set to write
+ // the new merged data.
+
+ FILE *mergewords = fopen(merge_config["word_list"], "r");
+
+ // Check for file access errors
+ if (!mergewords)
+ {
+ reportError(form("Unable to open word list file '%s'", merge_config["word_list"]));
+ }
+
+ // Read it in a line at a time...
+ while (fgets(buffer, sizeof(buffer), mergewords))
+ {
+ // Split the line up into the word, count, location, and
+ // document id, just like in words.cc(mergeWords).
+ word = good_strtok(buffer, "\t");
+ pair = good_strtok("\t");
+ wr.Clear(); // Reset count to 1, anchor to 0, and all that
+ sid = "-";  // placeholder key used when the line has no i: field
+ while (pair && *pair)
+ {
+ name = strtok(pair, ":");
+ value = strtok(0, "\n");
+ if (name && *name && value && *value)
+ {
+ switch (*name)
+ {
+ case 'c':
+ wr.count = atoi(value);
+ break;
+ case 'l':
+ wr.location = atoi(value);
+ break;
+ case 'i':
+ sid = value;
+ wr.id = atoi(value);
+ break;
+ case 'w':
+ wr.weight = atoi(value);
+ break;
+ case 'a':
+ wr.anchor = atoi(value);
+ break;
+ }
+ }
+ pair = good_strtok("\t");
+ }
+
+ // merge_dup_ids is keyed by the ORIGINAL (un-offset) merge_db DocID.
+ if (merge_dup_ids.Exists(sid))
+ {
+ if (verbose > 1)
+ {
+ cout << "htmerge: Discarding merged duplicate " << word << " in doc #"
+ << sid << " \n";
+ cout.flush();
+ }
+ continue;
+ }
+
+ // Record the word in the new file, renumbered the same way its doc was
+ fprintf(wordlist, "%s", word.get());
+ if (wr.count != 1)
+ {
+ fprintf(wordlist, "\tc:%d", wr.count);
+ }
+ fprintf(wordlist, "\tl:%d\ti:%d\tw:%d",
+ wr.location,
+ wr.id + docIDOffset,
+ wr.weight);
+ if (wr.anchor != 0)
+ {
+ fprintf(wordlist, "\ta:%d", wr.anchor);
+ }
+ putc('\n', wordlist);
+ }
+ fclose(mergewords);
+
+ // Replace the old word list with the new one; on failure the .new file is kept
+ fclose(wordlist);
+ if (rename(form("%s.new", wordtmp), wordtmp) < 0)
+ {
+ reportError(form("Unable to replace word list file '%s'", wordtmp));
+ }
+ }
diff -c3pN htdig3/htmerge/htmerge.cc htdig3.dev/htmerge/htmerge.cc
*** htdig3/htmerge/htmerge.cc Fri Dec 4 19:52:04 1998
--- htdig3.dev/htmerge/htmerge.cc Thu Jan 7 09:55:59 1999
***************
*** 13,25 ****
  //
  // Revision 1.7 1998/12/04 04:13:51 ghutchis
  // Use configure check to only include getopt.h when it exists.
! //
  // Revision 1.5 1998/10/02 17:07:32 ghutchis
- //
  // More configure changes
  //
  // Revision 1.4 1998/08/03 16:50:43 ghutchis
- //
  // Fixed compiler warnings under -Wall
  //
  // Revision 1.3 1998/01/05 05:43:24 turtle
--- 13,23 ----
  //
  // Revision 1.7 1998/12/04 04:13:51 ghutchis
  // Use configure check to only include getopt.h when it exists.
! //
  // Revision 1.5 1998/10/02 17:07:32 ghutchis
  // More configure changes
  //
  // Revision 1.4 1998/08/03 16:50:43 ghutchis
  // Fixed compiler warnings under -Wall
  //
  // Revision 1.3 1998/01/05 05:43:24 turtle
***************
*** 47,59 ****
  //
  Dictionary discard_list;
  
  int verbose = 0;
  int stats = 0;
  
  void usage();
  void reportError(char *msg);
  
-
  //*****************************************************************************
  // int main(int ac, char **av)
  //
--- 45,59 ----
  //
  Dictionary discard_list;
  
+ // This config is used for merging multiple databases
+ Configuration merge_config;
+
  int verbose = 0;
  int stats = 0;
  
  void usage();
  void reportError(char *msg);
  
  //*****************************************************************************
  // int main(int ac, char **av)
  //
*************** int main(int ac, char **av)
*** 63,73 ****
      int do_docs = 1;
      int alt_work_area = 0;
      String configfile = DEFAULT_CONFIG_FILE;
      int c;
      /* Currently unused extern int optind; */
      extern char *optarg;
  
! while ((c = getopt(ac, av, "svc:dwa")) != -1)
      {
          switch (c)
          {
--- 63,74 ----
      int do_docs = 1;
      int alt_work_area = 0;
      String configfile = DEFAULT_CONFIG_FILE;
+ String merge_configfile = 0;
      int c;
      /* Currently unused extern int optind; */
      extern char *optarg;
  
! while ((c = getopt(ac, av, "svm:c:dwa")) != -1)
      {
          switch (c)
          {
*************** int main(int ac, char **av)
*** 80,85 ****
--- 81,89 ----
              case 'c':
                  configfile = optarg;
                  break;
+ case 'm':
+ merge_configfile = optarg;
+ break;
              case 'v':
                  verbose++;
                  break;
*************** int main(int ac, char **av)
*** 104,109 ****
--- 108,124 ----
      }
          
      config.Read(configfile);
+
+ if (merge_configfile.length())
+ {
+ merge_config.Defaults(&defaults[0]);
+ if (access(merge_configfile, R_OK) < 0)
+ {
+ reportError(form("Unable to find configuration file '%s'",
+ merge_configfile.get()));
+ }
+ merge_config.Read(merge_configfile);
+ }
  
      if (alt_work_area != 0)
      {
*************** int main(int ac, char **av)
*** 138,143 ****
--- 153,168 ----
          }
      }
      
+ if (merge_configfile.length())
+ {
+ // Merge the databases specified in merge_configfile into the current
+ // databases. Do this first then update the other databases as usual
+ // Note: We don't have to specify anything, it's all in the config vars
+
+ mergeDB();
+ }
+
+
      String file1, file2;
      if (do_words)
      {
*************** int main(int ac, char **av)
*** 164,170 ****
  //
  void usage()
  {
! cout << "usage: htmerge [-v][-d][-w][-c configfile]\n";
      cout << "This program is part of ht://Dig " << VERSION << "\n\n";
      cout << "Options:\n";
      cout << "\t-v\tVerbose mode. This increases the verbosity of the\n";
--- 189,195 ----
  //
  void usage()
  {
! cout << "usage: htmerge [-v][-d][-w][-c configfile][-m merge_configfile]\n";
      cout << "This program is part of ht://Dig " << VERSION << "\n\n";
      cout << "Options:\n";
      cout << "\t-v\tVerbose mode. This increases the verbosity of the\n";
*************** void usage()
*** 173,178 ****
--- 198,206 ----
      cout << "\t\tgives a progress on what it is doing and where it is.\n\n";
      cout << "\t-d\tDo NOT merge the document database.\n\n";
      cout << "\t-w\tDo NOT merge the word database.\n\n";
+ cout << "\t-m merge_configfile\n";
+ cout << "\t\tMerge the databases specified into the databases specified by\n";
+ cout << "\t\t-c or the default.\n\n";
      cout << "\t-c configfile\n";
      cout << "\t\tUse the specified configuration file instead on the\n";
      cout << "\t\tdefault.\n\n";
diff -c3pN htdig3/htmerge/htmerge.h htdig3.dev/htmerge/htmerge.h
*** htdig3/htmerge/htmerge.h Sun Mar 23 23:33:23 1997
--- htdig3.dev/htmerge/htmerge.h Thu Jan 7 00:16:15 1999
*************** extern int n_array_elements;
*** 33,39 ****
--- 33,42 ----
  extern Dictionary discard_list;
  extern int verbose;
  extern int stats;
+ extern Configuration merge_config;
  
+
+ void mergeDB();
  void mergeWords(char *wordtmp, char *wordfile);
  void convertDocs(char *docs, char *docgdbm);
  void sort(char *);

----------------------------------------------------------------------
To unsubscribe from the htdig mailing list, send a message to
htdig-request@sdsu.edu containing the single word "unsubscribe" in
the body of the message.



This archive was generated by hypermail 2.0b3 on Thu Jan 07 1999 - 07:52:40 PST