README.cpp

00001 
00002 /* 
00003         DESCRIPTION:
00004 
00005 
00006 The class 'Sym' in this package provides a reference counted, hashed tree structure that can be used in genetic programming.
00007 The hash table behind the scenes makes sure that every subtree in the application is stored only once.
00008 This has a couple of advantages:
00009 
00010 o Memory: all subtrees are stored only once
00011 o Comparison: comparison for equality for two subtrees boils down to a pointer comparison 
00012 o Overview: by accessing the hashtable, you get an instant overview of the state of the population
00013 
00014 
00015 The disadvantage of this method is the constant time overhead for computing hashes. In practice,
00016 it seems to be fast enough.
00017 
00018 
00019 ===== How to Use this =========
00020 
00021 In essence, the Sym data structure contains two important pieces of data,
00022 the 'token' (of type token_t = int) and the children, a vector of Sym (called SymVec).
00023 The token should contain all information to be able to figure out which 
00024 function/terminal is represented by the node in the tree. By retrieving this token value
00025 and the SymVec it is possible to write recursive traversal routines for evaluation, printing,
00026 etc.
00027 
00028 */
00029 
00030 #include <iostream>
00031 #include "Sym.h"
00032 
00033 using namespace std;
00034 
00035 
00036 /* 
00037  * Suppose token value '0' designates our terminal, and token value '1' designates a binary function.
00038  * Later on a ternary function will be used as well, designated with token value '2'
00039  * The function below will create a tree of size three
00040 */
00041 Sym test1() {
00042     
00043     SymVec children; 
00044     children.push_back( Sym(0) ); // push_back is a member from std::vector, SymVec is derived from std::vector
00045     children.push_back( Sym(0) );
00046 
00047     Sym tree = Sym(token_t(1), children); // creates the tree
00048 
00049     /* Done, now print some information about the node */
00050 
00051     cout << "Size      =  " << tree.size() << endl;     // prints 3
00052     cout << "Depth     = " << tree.depth() << endl;    // prints 2
00053     cout << "Refcount  = " << tree.refcount() << endl; // prints 1
00054 
00055     Sym tree2 = tree; // make a copy (this only changes refcount)
00056 
00057     cout << "Refcount now = " << tree.refcount() << endl; // print 2
00058     
00059     return tree; // tree2 will be deleted and reference count returns to 1
00060 }
00061 
00062 /* To actually use the tree, evaluate it, the following simple recursive function
00063  * can be used
00064 */
00065 
00066 int eval(const Sym& sym) {
00067     if (sym.token() == 0) { // it's a terminal in this example
00068         return 1;
00069     }
00070     // else it's the function
00071     const SymVec& children = sym.args(); // get the children out, children.size() is the arity
00072 
00073     // let's assume that we've also got a ternary function designated by token '2'
00074     
00075     if (sym.token() == token_t(1))
00076         return eval(children[0]) + eval(children[1]); // evaluate
00077 
00078     return eval(children[0]) + eval(children[1]) * eval(children[2]); // a ternary function
00079 }
00080 
00081 /* Note that you simply use the stored token that was defined above. Simply checking the size of SymVec in
00082  * this particular example could have sufficed, but it's instructive to use the tokens.
00083  * 
00084  * And to test this: 
00085 */
00086 
00087 void test_eval() {
00088     
00089     Sym tree = test1();
00090 
00091     cout << "Evaluating tree1 returns " << eval(tree) << endl;
00092 }
00093 
00094 /* Writing initialization functions.
00095  *
00096  * As the Sym class is recursive in nature, initialization can simply be done using 
00097  * recursive routines as above. As an example, the following code does 'full' initialization.
00098  */
00099 
00100 Sym init_full(int depth_left) {
00101     if (depth_left == 0) return Sym(0); // create terminal
00102     // else create either a binary or a ternary function
00103     
00104     depth_left--;
00105     
00106     if (rand() % 2 == 0) { // create binary
00107         SymVec vec(2);
00108         vec[0] = init_full(depth_left);
00109         vec[1] = init_full(depth_left);
00110 
00111         return Sym(token_t(1), vec);
00112         
00113     } else { // create ternary tree
00114         SymVec vec(3);
00115         vec[0] = init_full(depth_left);
00116         vec[1] = init_full(depth_left);
00117         vec[2] = init_full(depth_left);
00118 
00119         return Sym(token_t(2), vec); // token value 2 designates a ternary now, even though the arity can simply be read from the size of the 'SymVec'
00120     }
00121     
00122 }
00123 
00124 
00125 /* Examining the hash table.
00126  *
00127  * The hash table is a static member of the Sym class, but can be obtained and inspected
00128  * at any point during the run. The hash table follows the SGI implementation of hashmap (and effectively
00129  * uses it in gcc). An example:
00130  */
00131 
00132 void inspect_hashtable() {
00133     SymMap& dag = Sym::get_dag(); // get the hashmap
00134     unsigned i = 0;
00135     for (SymMap::iterator it = dag.begin(); it != dag.end(); ++it) {
00136         Sym node(it); // initialize a 'sym' with the iterator 
00137 
00138         cout << "Node " << i++ << " size " << node.size();
00139         cout << " refcount " << node.refcount()-1; // -1: note that by creating the Sym above the refcount is increased
00140         cout << " depth " << node.depth();
00141         cout << '\n';
00142     }
00143     
00144 }
00145 
00146 /* The above code effectively examines all distinct subtrees in use in the application and prints some stats for the node */
00147 
00148 /* Manipulating trees 
00149  *
00150  * The Sym class is set up in such a way that you cannot change a Sym, so how do you perform crossover and mutation?
00151  *
00152  * Simple, you create new syms. The Sym class supports two functions to make this easier: 'get_subtree' and 'insert_subtree'.
00153  * These traverse the tree by index, where 0 designates the root and other values are indexed depth first.
00154  */
00155 
00156 Sym subtree_xover(Sym a, Sym b) {
00157     
00158     Sym to_insert = get_subtree(a,  rand() % a.size() ); // select random subtree, will crash if too high a value is given
00159     
00160     /* 'insert' it into b. This will not really insert, it will however create a new sym,
00161      * equal to 'b' but with a's subtree inserted at the designated spot. */
00162     return insert_subtree(b, rand() % b.size(), to_insert); 
00163     
00164 }
00165 
00166 /* Tying it together, we can create a simple genetic programming system. Mutation is not implemented here,
00167  * but would be easy enough to add by using recursion and/or 'set'. */
00168 
00169 void run_gp() {
00170     
00171     int ngens = 50;
00172     int popsize = 1000;
00173 
00174     cout << "Starting running " << popsize << " individuals for " << ngens << " generations." << endl;
00175     
00176     vector<Sym> pop(popsize); 
00177     
00178     // init population
00179     for (unsigned i = 0; i < pop.size(); ++i) {
00180         pop[i] = init_full(5); 
00181     }
00182     
00183     double best = 0.0;
00184     
00185     // do a very simple steady state tournament 
00186     for (unsigned gen = 0; gen < ngens * pop.size(); ++gen) {
00187         int sel1 = rand()% pop.size();
00188         int sel2 = rand() % pop.size();
00189         int sel3 = rand() % pop.size();
00190 
00191         double ev1 = eval(pop[sel1]);
00192         double ev3 = eval(pop[sel3]);
00193         
00194         double bst = max(ev1,ev3);
00195         if (bst > best) {
00196             best = bst;
00197         }
00198         
00199         if (ev3 > ev1) {
00200             sel1 = sel3; // selection pressure
00201         }
00202 
00203         Sym child = subtree_xover(pop[sel1], pop[sel2]);
00204         
00205         // Check for uniqueness
00206         if (child.refcount() == 1) pop[ rand() % pop.size() ] = child;
00207     }
00208     
00209     // and at the end:
00210     
00211     inspect_hashtable();
00212 
00213     // and also count number of nodes in the population
00214     int sz = 0;
00215     for (unsigned i = 0; i < pop.size(); ++i) { sz += pop[i].size(); }
00216     cout << "Number of distinct nodes " << Sym::get_dag().size() << endl;
00217     cout << "Nodes in population      " << sz << endl;
00218     cout << "ratio                    " << double(Sym::get_dag().size())/sz << endl;
00219     cout << "Best fitness             " << best << endl;
00220     
00221 }
00222 
00223 /* One extra mechanism is supported to add annotations to nodes. Something derived from
00224  * 'UniqueNodeStats' can be used to attach new information to nodes. For this to function,
00225  * we need to supply a 'factory' function that creates these node-stats; attach this function to the 
00226  * Sym class, so that it gets called whenever a new node is created. The constructors of the Sym class
00227  * take care of this.
00228  *
00229  * IMPORTANT: 
00230  *      in a realistic application, the factory function needs to be set BEFORE any Syms are created
00231  *      Mixing Syms creating with and without the factory can lead to unexpected results    
00232  *
00233  * First we derive some structure from UniqueNodeStats: */
00234 
00235 struct MyNodeStats : public UniqueNodeStats {
00236     
00237     int sumsize;
00238     
00239     ~MyNodeStats() { cout << "MyNodeStats::~MyNodeStats, sumsize = " << sumsize << endl; }
00240 };
00241 
00242 /* then define the factory function. It will get a Sym, which is just created.  */
00243 UniqueNodeStats* create_stats(const Sym& sym) {
00244     MyNodeStats* stats = new MyNodeStats; // Sym will take care of memory management
00245     
00246     int sumsize = sym.size();
00247     for (unsigned i = 0; i < sym.args().size(); ++i) {
00248         // retrieve the extra node stats of the child
00249         UniqueNodeStats* unique_stats = sym.args()[i].extra_stats(); // extra_stats retrieves the stats
00250         MyNodeStats* child_stats = static_cast<MyNodeStats*>(unique_stats); // cast it to the right struct
00251         sumsize += child_stats->sumsize;
00252     }
00253     
00254     stats->sumsize = sumsize;
00255     return stats; // now it will get attached to the node and deleted when its reference count goes to zero
00256 }
00257 
00258 void test_node_stats() {
00259     
00260     if (Sym::get_dag().size() != 0) {
00261         cerr << "Cannot mix nodes with and without factory functions" << endl;
00262         exit(1);
00263     }
00264     
00265     /* Very Important: attach the factory function to the Sym class */
00266     Sym::set_factory_function(create_stats);
00267 
00268     Sym tree = init_full(5); // create a tree
00269     
00270     // get extra node stats out
00271     MyNodeStats* stats = static_cast<MyNodeStats*>( tree.extra_stats() );
00272 
00273     cout << "Size = " << tree.size() << " SumSize = " << stats->sumsize << endl;
00274     
00275     Sym::clear_factory_function(); // reset
00276 }
00277 
00278 
00279 /* And run the code above */
00280 
00281 int main() {
00282     srand(time(0));
00283     cout << "********** TEST EVALUATION **************\n";
00284     test_eval();
00285     cout << "********** TEST ALGORITHM ***************\n";
00286     run_gp();
00287 
00288     cout << "********** TEST FACTORY  ****************\n";
00289     test_node_stats(); // can work because there are no live nodes
00290 
00291 }
00292 
00293 /* ********** Member function reference: ********************
00294  *
00295  * Sym()            The default constructor will create an undefined node (no token and no children), check for empty() to see if a node is undefined
00296  * 
00297  * Sym(token_t)     Create a terminal
00298  *
00299  * Sym(token_t, const SymVec&)
00300  *                  Create a node with token and SymVec as the children
00301  * 
00302  * Sym(SymIterator it)
00303  *                  Create a sym from an iterator (taken from the hashtable directly, or from Sym::iterator)
00304  *
00305  * dtor, copy-ctor and assignment
00306  *
00307  * UniqueNodeStats* extra_stats()    
00308  *                  Returns an UniqueNodeStats pointer (= 0 if no factory is defined)
00309  * 
00310  * 
00311  * int hashcode()   returns the hashcode for the node
00312  * 
00313  * int refcount()   returns the reference count for the node
00314  * 
00315  * bool operator==  checks for equality (note that this is a pointer compare, really really fast)
00316  * 
00317  * bool empty()     returns whether the node is undefined, i.e. created through the default ctor 
00318  * 
00319  * int arity()      shorthand for sym.args().size()
00320  * 
00321  * token_t token()  return identifying token for the node
00322  * 
00323  * const SymVec& args()
00324  *                  returns the children of the node (in a vector<Sym>)
00325  *                  
00326  * unsigned size()  returns the size, i.e., number of nodes
00327  * 
00328  * unsigned depth() returns the depth
00329  * 
00330  * iterator()       returns the pointer to the node in the hashtable
00331  *
00332  * 
00333  ********** Static functions: ********************
00334  *
00335  *
00336  * 
00337  * SymMap& get_dag()    returns the hash table containing all nodes. This should only be used for inspection,
00338  *                      even though the dag itself is not const. This to enable the use of the ctor Sym(SymIterator) to inspect
00339  *                      using the Sym interface (rather than the hash table interface). This does allow you to make destructive
00340  *                      changes to the class, so use with care
00341  *
00342  * set_factory_function( UniqueNodeStats (*)(const Sym&) )
00343  *                      Set the factory function
00344  *
00345  *  clear_factory_function()    
00346  *                      Clears the factory function, allocated UniqueNodeStats will still be deleted, but no new ones will be created.
00347  *
00348  ********** Utility Functions ******************** 
00349  * 
00350  * Sym get_subtree(const Sym& org, int i)
00351  *                      Retreive the i-th subtree from the Sym. Standard depth first ordering, where root has index 0 and the
00352  *                      rightmost terminal has index sym.size()-1
00353  *
00354  * Sym insert_subtree(const Sym& org, int i, const Sym& subtree)
00355  *                      Returns a Sym that is equal to 'org', for which the i-th subtree (same ordering as get_subtree) is replaced
00356  *                      by the third argument subtree.
00357  * 
00358  * Sym next(const Sym&)
00359  *                      Returns the successor of the argument sym from the hashtable with wrap around. This is implemented just because
00360  *                      it can be done. It may be an interesting way to mutate...
00361  * 
00362  * */
00363 
00364 

Generated on Thu Oct 19 05:06:42 2006 for EO by  doxygen 1.3.9.1