Index: htdig/Document.h =================================================================== RCS file: /opt/htdig/cvs/htdig3/htdig/Document.h,v retrieving revision 1.10.2.5 diff -3 -u -p -r1.10.2.5 Document.h --- htdig/Document.h 2000/01/14 01:23:43 1.10.2.5 +++ htdig/Document.h 2000/01/20 17:49:44 @@ -74,6 +74,8 @@ public: // void setUsernamePassword(const char *credentials) { authorization = credentials;} + + HtHTTP *GetHTTPHandler() { return HTTPConnect; } private: enum Index: htdig/Retriever.cc =================================================================== RCS file: /opt/htdig/cvs/htdig3/htdig/Retriever.cc,v retrieving revision 1.72.2.15 diff -3 -u -p -r1.72.2.15 Retriever.cc --- htdig/Retriever.cc 2000/01/20 03:55:47 1.72.2.15 +++ htdig/Retriever.cc 2000/01/20 17:49:46 @@ -297,43 +297,83 @@ Retriever::Start() while (more && noSignal) { - more = 0; + more = 0; // - // Go through all the current servers in sequence. We take only one - // URL from each server during this loop. This ensures that the load - // on the servers is distributed evenly. + // Go through all the current servers in sequence. + // If they support persistent connections, we keep on popping + // from the server queue until we reach a maximum number of + // consecutive requests (so we will probably have to introduce a new + // attribute, like "server_repeat_connections"). Or the loop may + // continue indefinitely if we set the max to -1 (and maybe + // the attribute too). + // If the server doesn't support persistent connections, we take + // only one URL from it, then we skip to the next server. // + + // Let's position at the beginning of the server list servers.Start_Get(); + + int count; + + // Maximum number of repeated requests with the same + // socket connection.
+ int max_repeat_requests; + while ( (server = (Server *)servers.Get_NextElement()) && noSignal) { if (debug > 1) - cout << "pick: " << server->host() << ", # servers = " << + cout << "pick: " << server->host() << ", # servers = " << servers.Count() << endl; - ref = server->pop(); - if (!ref) - continue; // Nothing on this server - // There may be no more documents, or the server - // has passed the server_max_docs limit - - // - // We have a URL to index, now. We need to register the - // fact that we are not done yet by setting the 'more' - // variable. - // - more = 1; - - // - // Deal with the actual URL. - // We'll check with the server to see if we need to sleep() - // before parsing it. - // - server->delay(); // This will pause if needed and reset the time - parse_url(*ref); - delete ref; - } + // If the Server doesn't support persistent connections, + // turn max_repeat_requests down to 1. + + // We already know if a server supports HTTP persistent connections, + // because we asked it for the robots.txt file (constructor of + // the class). + + if (server->IsPersistentConnectionAllowed()) + // Once the new attribute is set + // max_repeat_requests=config["server_repeat_connections"]; + max_repeat_requests = -1; // -1 means no limit on repeated requests + else + max_repeat_requests = 1; + + count = 0; + + while ( ( (max_repeat_requests ==-1) || + (count < max_repeat_requests) ) && + (ref = server->pop()) && noSignal) + { + count ++; + + // + // We have a URL to index, now. We need to register the + // fact that we are not done yet by setting the 'more' + // variable. So, we have to restart scanning the queue. + // + + more = 1; + + // + // Deal with the actual URL. + // We'll check with the server to see if we need to sleep() + // after parsing it (non-persistent connections only).
+ // + + parse_url(*ref); + delete ref; + + // No persistent HTTP connection available, so we change server and pause + if (max_repeat_requests == 1) + server->delay(); // This will pause if needed + // and reset the time + + } + } } + // if we exited on signal if (Retriever_noLog != log && !noSignal) { Index: htdig/Server.cc =================================================================== RCS file: /opt/htdig/cvs/htdig3/htdig/Server.cc,v retrieving revision 1.17.2.6 diff -3 -u -p -r1.17.2.6 Server.cc --- htdig/Server.cc 1999/12/11 16:19:47 1.17.2.6 +++ htdig/Server.cc 2000/01/20 17:49:47 @@ -21,6 +21,7 @@ #include "Document.h" #include "URLRef.h" #include "Transport.h" +#include "HtHTTP.h" // for checking persistent connections #include @@ -38,8 +39,10 @@ Server::Server(URL u, String *local_robo _port = u.port(); _bad_server = 0; _documents = 0; - _persistent_connections = 1; // Allowed by default + // Whether persistent connections are allowed is taken from the configuration + _persistent_connections = config.Boolean("persistent_connections"); + _max_documents = config.Value("server",_host,"server_max_docs", -1); _connection_space = config.Value("server",_host,"server_wait_time", 0); _last_connection.SettoNow(); // For getting robots.txt @@ -78,7 +81,23 @@ Server::Server(URL u, String *local_robo } } else if (!local_urls_only) + { status = doc.Retrieve(timeZero); + + // Let's check if persistent connections are both + // allowed by the configuration and possible after + // having requested the robots.txt file. + + HtHTTP *http; + if (IsPersistentConnectionAllowed() && + (http = doc.GetHTTPHandler())) + { + if (! http->isPersistentConnectionPossible()) + _persistent_connections=0; // not possible. Let's disable + // them on this server. + } + + } else status = Transport::Document_not_found;