Ignore:
Timestamp:
05/04/23 01:45:37 (12 months ago)
Author:
Maciej Komosinski
Message:

Simplify sequences of modifier genes, cancelling out antagonistic ones and limiting the number of identical genes

File:
1 edited

Legend:

Unmodified
Added
Removed
  • cpp/frams/genetics/f4/f4_general.cpp

    r1232 r1234  
    77
    88#include "f4_general.h"
    9 #include "../genooperators.h" //for GENOPER_ constants
     9#include "../genooperators.h" // for GENOPER_ constants
    1010#include <common/nonstd_stl.h>
    1111#include <common/log.h>
     
    306306                                break;
    307307                        }
    308                         case 'r':  case 'R':
     308                        case 'r':
     309                        case 'R':
    309310                        {
    310311                                // error: if neuron
     
    340341                                // error: if neuron
    341342                                if (type == CELL_NEURON) //some neurons have the same single-letter names as modifiers (for example G,S,D), but they are supposed to have is_neuroclass==true so they should indeed not be handled here
    342                                 {//however, what we see here is actually modifiers such as IdqEbWL (so not valid neuroclasses) that occurred within an already differentiated cell type==CELL_NEURON.
     343                                {//however, what we see here is actually modifiers such as IdqEbWL (so not valid neuroclasses) that occurred within an already differentiated cell of type==CELL_NEURON.
    343344                                        //printf("Handled as a modifier, but type==CELL_NEURON: '%c'\n", name);
    344345                                        // fix: delete it
     
    691692int f4_Cells::simulate()
    692693{
    693         constexpr bool print_debugging = false; //print the state of cells during development
     694        const bool PRINT_CELLS_DEVELOPMENT = false; //print the state of cells
    694695        errorcode = GENOPER_OK;
    695696
    696697        for (int i = 0; i < cell_count; i++)  C[i]->active = true;
    697698
    698         if (print_debugging) f4_Node::print_tree(C[0]->genot, 0);
    699         if (print_debugging) print_cells("Initialization");
     699        if (PRINT_CELLS_DEVELOPMENT) f4_Node::print_tree(C[0]->genot, 0);
     700        if (PRINT_CELLS_DEVELOPMENT) print_cells("Initialization");
    700701
    701702        // execute oneStep() in a cycle
    702         while (oneStep()) if (print_debugging) print_cells("Development step");
    703         if (print_debugging) print_cells("After last development step");
     703        while (oneStep()) if (PRINT_CELLS_DEVELOPMENT) print_cells("Development step");
     704        if (PRINT_CELLS_DEVELOPMENT) print_cells("After last development step");
    704705
    705706#ifdef EXTRA_STEP_CELL_DEVELOPMENT
    706707        if (errorcode == GENOPER_OK)
    707708        {
    708                 oneStep(); if (print_debugging) print_cells("After extra step"); //for these "halted" (yielding) cells (they have active==false) that wait for other cells to develop. Without this step, the last, recently halted one(s) may miss the "retrying" step if all active==true cells became active==false in the last step.
     709                oneStep(); if (PRINT_CELLS_DEVELOPMENT) print_cells("After extra step"); //for these "halted" (yielding) cells (they have active==false) that wait for other cells to develop. Without this step, the last, recently halted one(s) may miss the "retrying" step if all active==true cells became active==false in the last step.
    709710        }
    710711#endif
     
    745746        //DB( printf("Cell simulation done, %d cells. \n", nc); )
    746747
    747         if (print_debugging) print_cells("Final");
     748        if (PRINT_CELLS_DEVELOPMENT) print_cells("Final");
    748749
    749750        return errorcode;
     
    10581059{
    10591060        for (int i = 0; i < indent; i++) printf(" ");
    1060         printf("%s (%d)", root->name.c_str(), root->count());
     1061        printf("%s%s%s (%d)", root->neuclass != NULL ? "N:" : "", root->name.c_str(), root->name == "#" ? std::to_string(root->reps).c_str() : "", root->count() - 1);
    10611062        if (root->name == "[")
    10621063                printf("     from=%-3d  weight=%g", root->conn_from, root->conn_weight);
     
    12531254}
    12541255
    1255 // scan genotype string and build tree
     1256// scan genotype string and build a tree
    12561257// return >1 for error (errorpos)
    1257 int f4_processRecur(const char* genot, int &pos_inout, f4_Node *parent)
    1258 {
     1258int f4_processRecur(const char* genot, const int genot_len, int &pos_inout, f4_Node *parent)
     1259{
     1260        static const char *all_modifiers_no_comma = F14_MODIFIERS; //I did experiments with added comma (see all_modifiers_for_simplify below) which had the advantage of commas not breaking sequences of modifiers, thus longer sequences of modifiers (including commas) could be simplified and genetic bloat was further reduced. But since we impose a limit on the number of modifier chars in GenoOperators::simplifiedModifiers(), it would also influence commas (e.g. no more than 8 commas per sequence), so in order to leave commas entirely unlimited let's exclude them from simplification. Note that currently 'X' or any other non-F14_MODIFIERS char also separates the sequence to be simplified, so if we wanted a really intensive simplification, it should occur during development, when we know precisely which genes influence each f4_Cell.
     1261        //const char *Geno_f4::all_modifiers_for_simplify = F14_MODIFIERS ",\1"; //'\1' added to keep the number of chars even, avoid exceptions in logic and save the simple rule that the sequence is made of pairs (gene,contradictory gene), where a comma has no contradictory gene and \1 is unlikely to occur in the f4 genotype (and not allowed), so no risk it will cancel out a comma during simplification.
     1262
     1263
    12591264        f4_Node *par = parent;
    12601265
    1261         if (pos_inout >= (int)strlen(genot))
    1262                 return (int)strlen(genot) + 1;
    1263 
    1264         while (pos_inout < (int)strlen(genot))
    1265         {
    1266 //#define PRINT_PARSING_LOCATION
    1267 #ifdef PRINT_PARSING_LOCATION
    1268                 printf("%s\n", genot);
    1269                 for (int i = 0; i < pos_inout; i++) printf(" ");
    1270                 printf("^\n");
    1271 #endif
     1266        if (pos_inout >= genot_len)
     1267                return genot_len + 1;
     1268
     1269        while (pos_inout < genot_len)
     1270        {
     1271                const bool PRINT_PARSING_LOCATION = false;
     1272                if (PRINT_PARSING_LOCATION)
     1273                {
     1274                        printf("%s\n", genot);
     1275                        for (int i = 0; i < pos_inout; i++) printf(" ");
     1276                        printf("^\n");
     1277                }
    12721278                switch (genot[pos_inout])
    12731279                {
     
    12771283                        par = node;
    12781284                        pos_inout++; //move after '<'
    1279                         int res = f4_processRecur(genot, pos_inout, par);
     1285                        int res = f4_processRecur(genot, genot_len, pos_inout, par);
    12801286                        if (res) return res;
    1281                         if (pos_inout < (int)strlen(genot))
    1282                         {
    1283                                 res = f4_processRecur(genot, pos_inout, par);
     1287                        if (pos_inout < genot_len)
     1288                        {
     1289                                res = f4_processRecur(genot, genot_len, pos_inout, par);
    12841290                                if (res) return res;
    12851291                        }
     
    12871293                        {
    12881294                                //MacKo 2023-04, more strict behavior: instead of silent repair (no visible effect to the user, genotype stays invalid but is interpreted and reported as valid), we now point out where the error is. For example <X> or <X><X or <X><N:N>
    1289                                 return (int)strlen(genot) + 1;
     1295                                return genot_len + 1;
    12901296                                //old silent repair:
    1291                                 //node = new f4_Node(">", par, int(strlen(genot)) - 1);
     1297                                //node = new f4_Node(">", par, genot_len - 1);
    12921298                        }
    12931299                        return 0;  // OK
     
    13101316                        // skip number
    13111317                        pos_inout += end - (genot + pos_inout);
    1312                         int res = f4_processRecur(genot, pos_inout, node);
     1318                        int res = f4_processRecur(genot, genot_len, pos_inout, node);
    13131319                        if (res) return res;
    1314                         if (pos_inout < (int)strlen(genot))
    1315                         {
    1316                                 res = f4_processRecur(genot, pos_inout, node);
     1320                        if (pos_inout < genot_len)
     1321                        {
     1322                                res = f4_processRecur(genot, genot_len, pos_inout, node);
    13171323                                if (res) return res;
    13181324                        }
    13191325                        else // ran out
    13201326                        {
    1321                                 return (int)strlen(genot) + 1; //MacKo 2023-04: report an error, better to be more strict instead of a silent repair (genotype stays invalid but is interpreted and reported as valid) with non-obvious consequences?
     1327                                return genot_len + 1; //MacKo 2023-04: report an error, better to be more strict instead of a silent repair (genotype stays invalid but is interpreted and reported as valid) with non-obvious consequences?
    13221328                                //earlier approach - silently treating this problem (we don't ever see where the error is because it gets corrected in some way here, while parsing the genotype, and error location in the genotype is never reported):
    1323                                 //node = new f4_Node(">", par, int(strlen(genot)) - 1); // check if needed and if this is really the best repair operation; seemed to happen too many times in succession for some genotypes even though they were only a result of f4 operators, not manually created... and the operators should not generate invalid genotypes, right? Or maybe crossover does? Seems like too many #N's for closing >'s; removing #N or adding > helped. Operators somehow don't do it properly sometimes? But F4_ADD_REP adds '>'... (TODO)
     1329                                //node = new f4_Node(">", par, genot_len - 1); // check if needed and if this is really the best repair operation; seemed to happen too many times in succession for some genotypes even though they were only a result of f4 operators, not manually created... and the operators should not generate invalid genotypes, right? Or maybe crossover does? Seems like too many #N's for closing >'s; removing #N or adding > helped. Operators somehow don't do it properly sometimes? But F4_ADD_REP adds '>'... (TODO)
    13241330                        }
    13251331                        return 0;  // OK
     
    13911397                        break;
    13921398                }
    1393                 default: // 'X' and ',' and all modifiers and also invalid symbols - add a node, for invalid symbols build will give the error or repair
     1399                default: // 'X' and ',' and all modifiers and also invalid symbols - add a node. For symbols that are not valid in f4, the cell development process will give the error or repair
    13941400                {
    13951401                        //printf("any regular character '%c'\n", genot[pos_inout]);
    1396                         //TODO here: read a continuous sequence of modifiers, sort and optimize ("collapse") it like in f1, then add to tree
     1402#define F4_SIMPLIFY_MODIFIERS //avoid long sequences like ...<X>llmlIilImmimiimmimifmfl<fifmmimilimmmiimiliffmfliIfififlliflimfliffififmiffmflllfflimlififfiiffifIr<r<... - another option, instead of simplifying while parsing here, would be mutations: when they add/modify/remove a modifier node, they could "clean" the tree by removing nodes when they encounter contradictory modifiers on the same subpath, and also limit the number of modifiers just as GenoOperators::simplifiedModifiers() does.
     1403#ifdef F4_SIMPLIFY_MODIFIERS
     1404                        char *ptr = (char*)(genot + pos_inout);
     1405
     1406#ifdef __BORLANDC__ // "[bcc32c Error] cannot compile this non-trivial TLS destruction yet" (C++B 10.4u2)
     1407                        static
     1408#else
     1409                        thread_local
     1410#endif
     1411                                vector<int> modifs_counts(strlen(all_modifiers_no_comma)); ///<an array with a known constant size storing counters of each modifier symbol from all_modifiers_no_comma, created once to avoid reallocation every time when modifier genes are simplified during parsing. Initialization of required size; it will never be resized.
     1412                        std::fill(modifs_counts.begin(), modifs_counts.end(), 0); //zeroing only needed if we encountered a char from all_modifiers_no_comma and enter the 'while' loop below
     1413
     1414                        while (char *m = GenoOperators::strchrn0(all_modifiers_no_comma, *ptr)) //only processes a section of chars known in all_modifiers_no_comma, other characters will exit the loop
     1415                        {
     1416                                modifs_counts[m - all_modifiers_no_comma]++;
     1417                                GenoOperators::skipWS(++ptr); //advance and ignore whitespace
     1418                        }
     1419                        int advanced = ptr - (genot + pos_inout);
     1420                        if (advanced > 0) //found modifiers
     1421                        {
     1422                                string simplified = GenoOperators::simplifiedModifiers(all_modifiers_no_comma, modifs_counts);
     1423                                // add a node for each char in "simplified"
     1424                                for (size_t i = 0; i < simplified.length(); i++)
     1425                                {
     1426                                        int pos = GenoOperators::strchrn0(genot + pos_inout, simplified[i]) - genot; //unnecessarily finding the same char, if it occurs multiple times in simplified
     1427                                        f4_Node *node = new f4_Node(simplified[i], par, pos); //location is approximate. In the simplification process we don't trace where the origin(s) of the simplified[i] gene were. We provide 'pos' as the first occurrence of simplified[i] (for example, all 'L' will have the same location assigned, but at least this is where 'L' occurred in the genotype, so in case of any modification of a node (repair, removal, whatever... even mapping of genes) the indicated gene will be one of the responsible ones)
     1428                                        par = node;
     1429                                }
     1430                                pos_inout += advanced;
     1431                        }
     1432                        else // genot[pos_inout] is a character not present in all_modifiers_no_comma, so treat it as a regular individual char just as it would be without simplification
     1433                        {
     1434                                f4_Node *node = new f4_Node(genot[pos_inout], par, pos_inout);
     1435                                par = node;
     1436                                pos_inout++;
     1437                        }
     1438#else
    13971439                        f4_Node *node = new f4_Node(genot[pos_inout], par, pos_inout);
    13981440                        par = node;
    13991441                        pos_inout++;
     1442#endif // F4_SIMPLIFY_MODIFIERS
    14001443                        break;
    14011444                }
     
    14061449        if (par && par->name != ">")
    14071450        {
    1408                 //happens when pos_inout == strlen(genot)
     1451                //happens when pos_inout == genot_len
    14091452                //return pos_inout; //MacKo 2023-04: could report an error instead of silent repair, but repair operators only work in Cells (i.e., after the f4_Node tree has been parsed without errors and Cells can start developing) so we don't want to make a fatal error because of missing '>' here. Also after conversions from Cells to text, trailing '>' is deliberately removed... and also the simplest genotype is officially X, not X>.
    1410                 new f4_Node('>', par, int(strlen(genot)) - 1);
     1453                new f4_Node('>', par, genot_len - 1);
    14111454        }
    14121455
     
    14171460{
    14181461        int pos = 0;
    1419         int res = f4_processRecur(genot, pos, root);
     1462        int res = f4_processRecur(genot, (int)strlen(genot), pos, root);
    14201463        if (res > 0)
    14211464                return res; //parsing error
Note: See TracChangeset for help on using the changeset viewer.