Bitstream tree Mark McIlroy June 2022 This tree is a data structure which is a type of Binary Search tree. A Bitstream tree is naturally balanced it does not require automatic or manual rebalancing. Any binary data key can be used for insert and lookup. For sorted output, the key must be an ASCII text key, integer, or floating point number converted to text and zero padded. The maximum depth of the tree is the number of bits in the key. Basic structure The basic insertion algorithm is 1. Start at the top of the tree. 2. For each bit in the key from left to right, if the bit is zero go down the left path if the bit is one go down the right path if reaching a leaf node, add new nodes below to enable the above two statements to proceed 3. Once all bits have been processed we will have reached depth 'n' where 'n' is the number of bits in the key. Add the data item at this position. The same procedure is used to search and locate a data item by its key. A preorder binary search tree traversal for a sorted output is required to ensure that 'aa' comes before 'aaa'. This structure has low space efficiency for a small number of long keys, however the space efficiency rises to 50% for a full tree. Basic algorithm function int bst_insert_type_1( byreference bst_tree_node bst_tree, string key, link general data_ptr ) { var int bit; var int num_key_bits; var binary key_bits; var link bst_tree_node bst_nptr; var int depth; num_key_bits = slength( key ) * 8; key_bits = cstring_to_binary( key, false ); bst_nptr = & bst_tree; depth = 0; while (depth < num_key_bits) { bit = bget_bit( key_bits, depth ); if (bit == 0) { if (bst_nptr.lchild == NULL_LINK) { bst_nptr.lchild = new; bst_nptr.lchild.is_set = false; bst_nptr.lchild.duplicate_count = 0; bst_nptr.lchild.lchild = NULL_LINK; bst_nptr.lchild.rchild = NULL_LINK; } bst_nptr = bst_nptr.lchild; } else { if (bst_nptr.rchild == NULL_LINK) { bst_nptr.rchild = new; bst_nptr.rchild.is_set = false; bst_nptr.rchild.duplicate_count = 0; bst_nptr.rchild.lchild = NULL_LINK; bst_nptr.rchild.rchild = NULL_LINK; } bst_nptr = bst_nptr.rchild; } depth++; } result = 0; if (bst_nptr.is_set) { bst_nptr.data_ptr = data_ptr; bst_nptr.duplicate_count ++; result = 1; } bst_nptr.key = key; bst_nptr.is_set = true; } Complete structure The method above is extremely simple to implement and is fast to insert/search. However every data item is at a depth of 'n' in the tree where 'n' is the number of bits in its key, which is not efficient for long keys. The algorithm below is the full implementation which stores multiple bits in each node. // Bitstream tree. // // The maximum depth of an item in the tree is the number of bits in the item's key. // // For sorted output the key must be a text string, int, or floating point number converted to text and zero padded. // // For insert/lookup only any binary key can be used. /* Scan loop: bst_prep_for_scan( tree ); bst_nptr = tree.next; while (bst_nptr != NULL) { // .... processing, e.g. print( bst_nptr.key ); bst_nptr = bst_nptr.next; } */ #include "bst_tree.h" #include #include #include void bst_node_init( struct bst_tree *tptr ) { tptr->key_bits = NULL; tptr->num_key_bytes = 0; tptr->num_key_bits_this_node = 0; tptr->is_set = 0; tptr->duplicate_count = 0; tptr->data_ptr = NULL; tptr->lchild = NULL; tptr->rchild = NULL; tptr->next = NULL; } struct bst_tree *bst_scan( struct bst_tree *bst_nptr, struct bst_tree *curr_nptr ); //------------------------------------------------------------------------------------- // Insert a data item into a Bitstream tree. Returns 0 on success, or 1 if a duplicate. // // For text keys count the number of characters excluding the null as 'num_key_bytes'. //------------------------------------------------------------------------------------- int bst_insert( struct bst_tree *tree_nptr, char *key_bits, int num_key_bytes, void *data_ptr ) { int bit, bit1; int num_key_bits; int tree_num_bits; int bit_pos; struct bst_tree *bst_tree, *bst_nptr, *bst_nptr2; int finished; int result; num_key_bits = num_key_bytes * 8; bst_nptr = tree_nptr; bit_pos = 0; finished = false; // continue descent until reaching the end of the node's bit sequence, the insert key number of bits, or key mismatch // cases: insert key is shorter and matches // insert key is longer and matches // exact match of keys (i.e. a duplicate key) // bit mismatch bit = bget_bit( key_bits, num_key_bytes, 0 ); if (bit == 0 && tree_nptr->lchild == NULL) { bst_nptr->lchild = malloc( sizeof( struct bst_tree ) ); bst_node_init( bst_nptr->lchild ); bst_nptr->lchild->num_key_bits_this_node = num_key_bits; bst_nptr = bst_nptr->lchild; } else if (bit == 1 && tree_nptr->rchild == NULL) { bst_nptr->rchild = malloc( sizeof( struct bst_tree ) ); bst_node_init( bst_nptr->rchild ); bst_nptr->rchild->num_key_bits_this_node = num_key_bits; bst_nptr = bst_nptr->rchild; } else { bit = bget_bit( key_bits, num_key_bytes, 0 ); if (bit == 0) bst_nptr = bst_nptr->lchild; else bst_nptr = bst_nptr->rchild; tree_num_bits = bst_nptr->num_key_bits_this_node; while (! finished) { if (bit_pos == num_key_bits) { if (bst_nptr->is_set) { if (num_key_bytes == bst_nptr->num_key_bytes && memcmp( key_bits, bst_nptr->key_bits, num_key_bytes ) == 0) finished = true; } } if (! finished) { if (bit_pos >= tree_num_bits) // match, and insert key is longer than key bits processed so far { bit = bget_bit( key_bits, num_key_bytes, bit_pos ); if (bit == 0) { if (bst_nptr->lchild == NULL) { bst_nptr->lchild = malloc( sizeof( struct bst_tree ) ); bst_node_init( bst_nptr->lchild ); bst_nptr->lchild->num_key_bits_this_node = num_key_bits - bit_pos; bst_nptr = bst_nptr->lchild; finished = true; } else { bst_nptr = bst_nptr->lchild; tree_num_bits += bst_nptr->num_key_bits_this_node; } } else { if (bst_nptr->rchild == NULL) { bst_nptr->rchild = malloc( sizeof( struct bst_tree ) ); bst_node_init( bst_nptr->rchild ); bst_nptr->rchild->num_key_bits_this_node = num_key_bits - bit_pos; bst_nptr = bst_nptr->rchild; finished = true; } else { bst_nptr = bst_nptr->rchild; tree_num_bits += bst_nptr->num_key_bits_this_node; } } } else if (bit_pos >= num_key_bits) // match, and insert key is shorter than currently stored key { bit1 = bget_bit( bst_nptr->key_bits, num_key_bits, bit_pos ); if (bit1 == 0) { bst_nptr2 = malloc( sizeof( struct bst_tree ) ); bst_node_init( bst_nptr2 ); bst_nptr2->is_set = bst_nptr->is_set; bst_nptr2->num_key_bits_this_node = tree_num_bits - bit_pos; bst_nptr2->lchild = bst_nptr->lchild; bst_nptr2->rchild = bst_nptr->rchild; bst_nptr2->key_bits = malloc( bst_nptr->num_key_bytes ); memcpy( bst_nptr2->key_bits, bst_nptr->key_bits, bst_nptr->num_key_bytes ); bst_nptr2->num_key_bytes = bst_nptr->num_key_bytes; bst_nptr2->data_ptr = bst_nptr->data_ptr; bst_nptr2->duplicate_count = bst_nptr->duplicate_count; bst_nptr->lchild = bst_nptr2; bst_nptr->rchild = NULL; bst_nptr->is_set = false; bst_nptr->num_key_bits_this_node -= bst_nptr2->num_key_bits_this_node; finished = true; } else { bst_nptr2 = malloc( sizeof( struct bst_tree ) ); bst_node_init( bst_nptr2 ); bst_nptr2->is_set = bst_nptr->is_set; bst_nptr2->num_key_bits_this_node = tree_num_bits - bit_pos; bst_nptr2->lchild = bst_nptr->lchild; bst_nptr2->rchild = bst_nptr->rchild; bst_nptr2->key_bits = malloc( bst_nptr->num_key_bytes ); memcpy( bst_nptr2->key_bits, bst_nptr->key_bits, bst_nptr->num_key_bytes ); bst_nptr2->num_key_bytes = bst_nptr->num_key_bytes; bst_nptr2->data_ptr = bst_nptr->data_ptr; bst_nptr2->duplicate_count = bst_nptr->duplicate_count; bst_nptr->rchild = bst_nptr2; bst_nptr->lchild = NULL; bst_nptr->is_set = false; bst_nptr->num_key_bits_this_node -= bst_nptr2->num_key_bits_this_node; finished = true; } } else { bit = bget_bit( key_bits, bit_pos ); bit1 = bget_bit( bst_nptr->key_bits, bit_pos ); if (bit == bit1) // progressing though the key, match bit_pos++; else { // bit pattern mismatch if (bit == 0) { bst_nptr2 = malloc( sizeof( struct bst_tree ) ); bst_node_init( bst_nptr2 ); bst_nptr2->is_set = bst_nptr->is_set; bst_nptr2->num_key_bits_this_node = tree_num_bits - bit_pos; bst_nptr2->lchild = bst_nptr->lchild; bst_nptr2->rchild = bst_nptr->rchild; bst_nptr2->key_bits = malloc( bst_nptr->num_key_bytes ); memcpy( bst_nptr2->key_bits, bst_nptr->key_bits, bst_nptr->num_key_bytes ); bst_nptr2->num_key_bytes = bst_nptr->num_key_bytes; bst_nptr2->data_ptr = bst_nptr->data_ptr; bst_nptr2->duplicate_count = bst_nptr->duplicate_count; bst_nptr->rchild = bst_nptr2; bst_nptr->lchild = malloc( sizeof( struct bst_tree ) ); bst_nptr->lchild->is_set = false; bst_nptr->lchild->lchild = NULL; bst_nptr->lchild->rchild = NULL; bst_nptr->lchild->num_key_bits_this_node = num_key_bits - bit_pos; bst_nptr->is_set = false; bst_nptr->num_key_bits_this_node -= bst_nptr2->num_key_bits_this_node; bst_nptr = bst_nptr->lchild; finished = true; } else { bst_nptr2 = malloc( sizeof( struct bst_tree ) ); bst_node_init( bst_nptr2 ); bst_nptr2->is_set = bst_nptr->is_set; bst_nptr2->num_key_bits_this_node = tree_num_bits - bit_pos; bst_nptr2->lchild = bst_nptr->lchild; bst_nptr2->rchild = bst_nptr->rchild; bst_nptr2->key_bits = malloc( bst_nptr->num_key_bytes ); memcpy( bst_nptr2->key_bits, bst_nptr->key_bits, bst_nptr->num_key_bytes ); bst_nptr2->num_key_bytes = bst_nptr->num_key_bytes; bst_nptr2->data_ptr = bst_nptr->data_ptr; bst_nptr2->duplicate_count = bst_nptr->duplicate_count; bst_nptr->lchild = bst_nptr2; bst_nptr->rchild = malloc( sizeof( struct bst_tree ) ); bst_nptr->rchild->is_set = false; bst_nptr->rchild->lchild = NULL; bst_nptr->rchild->rchild = NULL; bst_nptr->rchild->num_key_bits_this_node = num_key_bits - bit_pos; bst_nptr->is_set = false; bst_nptr->num_key_bits_this_node -= bst_nptr2->num_key_bits_this_node; bst_nptr = bst_nptr->rchild; finished = true; } } } } } } if (bst_nptr->is_set) { if (bst_nptr->num_key_bytes != num_key_bytes || memcmp( bst_nptr->key_bits, key_bits, num_key_bytes ) != 0) printf( "Internal error, duplicate key not matching: keys %s and %s", bst_nptr->key_bits, key_bits ); bst_nptr->duplicate_count++; result = 1; } else { result = 0; bst_nptr->duplicate_count = 0; bst_nptr->is_set = true; if (bst_nptr->key_bits != NULL) free( bst_nptr->key_bits ); bst_nptr->key_bits = malloc( num_key_bytes ); memcpy( bst_nptr->key_bits, key_bits, num_key_bytes ); bst_nptr->num_key_bytes = num_key_bytes; bst_nptr->data_ptr = data_ptr; } return (result); } //------------------------------------------------------------------------------------- // Search for an item in a tree. Returns a pointer to the tree node if found otherwise NULL. //------------------------------------------------------------------------------------- struct bst_tree *bst_search( struct bst_tree *tree_nptr, char *key_bits, int num_key_bytes, int *found ) { int bit, bit1; int num_key_bits; int tree_num_bits; int bit_pos; struct bst_tree *bst_tree, *bst_nptr, *bst_nptr2; int finished; struct bst_tree *result; num_key_bits = num_key_bytes * 8; bst_nptr = tree_nptr; bit_pos = 0; *found = false; finished = false; bit = bget_bit( key_bits, num_key_bytes, 0 ); if (bit == 0) bst_nptr = bst_nptr->lchild; else bst_nptr = bst_nptr->rchild; tree_num_bits = bst_nptr->num_key_bits_this_node; while (! finished) { if (bit_pos == num_key_bits) { if (bst_nptr->is_set) { if (num_key_bytes == bst_nptr->num_key_bytes && memcmp( key_bits, bst_nptr->key_bits, num_key_bytes ) == 0) *found = true; } finished = true; } if (! finished) { if (bit_pos >= tree_num_bits) { bit = bget_bit( key_bits, num_key_bytes, bit_pos ); if (bit == 0) { if (bst_nptr->lchild == NULL) finished = true; else bst_nptr = bst_nptr->lchild; } else { if (bst_nptr->rchild == NULL) finished = true; else bst_nptr = bst_nptr->rchild; } tree_num_bits += bst_nptr->num_key_bits_this_node; } } bit_pos++; } if (*found) result = bst_nptr; else result = NULL; return (result); } //------------------------------------------------------------------------------------- // Returns true if the key was found in the tree. //------------------------------------------------------------------------------------- int bst_key_is_in_tree( struct bst_tree *tree_nptr, char *key_bits, int num_key_bytes ) { int bit, bit1; int num_key_bits; int tree_num_bits; int bit_pos; struct bst_tree *bst_tree, *bst_nptr, *bst_nptr2; int finished; struct bst_tree *result; int found; num_key_bits = num_key_bytes * 8; bst_nptr = tree_nptr; bit_pos = 0; found = false; finished = false; bit = bget_bit( key_bits, num_key_bytes, 0 ); if (bit == 0) bst_nptr = bst_nptr->lchild; else bst_nptr = bst_nptr->rchild; tree_num_bits = bst_nptr->num_key_bits_this_node; while (! finished) { if (bit_pos == num_key_bits) { if (bst_nptr->is_set) { if (num_key_bytes == bst_nptr->num_key_bytes && memcmp( key_bits, bst_nptr->key_bits, num_key_bytes ) == 0) found = true; } finished = true; } if (! finished) { if (bit_pos >= tree_num_bits) { bit = bget_bit( key_bits, num_key_bytes, bit_pos ); if (bit == 0) { if (bst_nptr->lchild == NULL) finished = true; else bst_nptr = bst_nptr->lchild; } else { if (bst_nptr->rchild == NULL) finished = true; else bst_nptr = bst_nptr->rchild; } tree_num_bits += bst_nptr->num_key_bits_this_node; } } bit_pos++; } return (found); } //------------------------------------------------------------------------------------- // Delete an item from a tree. Returns true if the item was found. //------------------------------------------------------------------------------------- int bst_delete_item( struct bst_tree *tree_nptr, char *key_bits, int num_key_bytes ) { int bit, bit1; int num_key_bits; int tree_num_bits; int bit_pos; struct bst_tree *bst_tree, *bst_nptr, *bst_nptr2; int finished; struct bst_tree *result; int found; num_key_bits = num_key_bytes * 8; bst_nptr = tree_nptr; bit_pos = 0; found = false; finished = false; bit = bget_bit( key_bits, num_key_bytes, 0 ); if (bit == 0) bst_nptr = bst_nptr->lchild; else bst_nptr = bst_nptr->rchild; tree_num_bits = bst_nptr->num_key_bits_this_node; while (! finished) { if (bit_pos == num_key_bits) { if (bst_nptr->is_set) { if (num_key_bytes == bst_nptr->num_key_bytes && memcmp( key_bits, bst_nptr->key_bits, num_key_bytes ) == 0) { bst_nptr->is_set = false; found = true; } } finished = true; } if (! finished) { if (bit_pos >= tree_num_bits) { bit = bget_bit( key_bits, num_key_bytes, bit_pos ); if (bit == 0) { if (bst_nptr->lchild == NULL) finished = true; else bst_nptr = bst_nptr->lchild; } else { if (bst_nptr->rchild == NULL) finished = true; else bst_nptr = bst_nptr->rchild; } tree_num_bits += bst_nptr->num_key_bits_this_node; } } bit_pos++; } return (found); } //------------------------------------------------------------------------------------- // Print a tree. //------------------------------------------------------------------------------------- void bst_preorder( struct bst_tree *bst_nptr ) { char str[1000]; if (bst_nptr->is_set) { memcpy( str, bst_nptr->key_bits, bst_nptr->num_key_bytes ); str[bst_nptr->num_key_bytes] = '\0'; // if (bst_nptr->data_ptr != NULL) // printf( "%s: %s\n", str, bst_nptr->data_ptr ); // else printf( "%s\n", str ); } if (bst_nptr->lchild != NULL) bst_preorder( bst_nptr->lchild ); if (bst_nptr->rchild != NULL) bst_preorder( bst_nptr->rchild ); } //------------------------------------------------------------------------------------- // Call this function before a first/next scan loop. //------------------------------------------------------------------------------------- void bst_prep_for_scan( struct bst_tree *bst_nptr ) { struct bst_tree *curr_nptr; curr_nptr = bst_nptr; curr_nptr = bst_scan( bst_nptr, curr_nptr ); curr_nptr->next = NULL; } struct bst_tree *bst_scan( struct bst_tree *bst_nptr, struct bst_tree *curr_nptr ) { if (bst_nptr->is_set) { curr_nptr->next = bst_nptr; curr_nptr = bst_nptr; } if (bst_nptr->lchild != NULL) curr_nptr = bst_scan( bst_nptr->lchild, curr_nptr ); if (bst_nptr->rchild != NULL) curr_nptr = bst_scan( bst_nptr->rchild, curr_nptr ); return (curr_nptr); } int bget_bit( char *key_bits, int num_key_bytes, int pos ) { int num; int pos1, bit_pos; pos1 = (int) (pos / 8); bit_pos = pos - (pos1 * 8); if (pos1 >= num_key_bytes) num = 0; else { num = 0; if (bit_pos == 0) {if ((key_bits[pos1] & 0x80) != 0) num = 1; } else if (bit_pos == 1) {if ((key_bits[pos1] & 0x40) != 0) num = 1; } else if (bit_pos == 2) {if ((key_bits[pos1] & 0x20) != 0) num = 1; } else if (bit_pos == 3) {if ((key_bits[pos1] & 0x10) != 0) num = 1; } else if (bit_pos == 4) {if ((key_bits[pos1] & 0x08) != 0) num = 1; } else if (bit_pos == 5) {if ((key_bits[pos1] & 0x04) != 0) num = 1; } else if (bit_pos == 6) {if ((key_bits[pos1] & 0x02) != 0) num = 1; } else if (bit_pos == 7) {if ((key_bits[pos1] & 0x01) != 0) num = 1; } } return (num); } Part 2 Variations Collating sequences, such as changing A < a to a < A could be done by mapping the input key bits to a different set of codes. This structure is primarily intended for in-memory structures. For databases, a node and all the, say, 32 nodes below it could be combined into a single block and written/read from disk in a single operation. Preparation for a tree scan: curr_node = tree_root nptr = tree_root scan( nptr ) { curr_node.next_ptr = nptr; curr_node = nptr; if (nptr has left child) scan( nptr-left_child ) if (nptr has right child) scan( nptr-right_child ) } Then nptr = tree_root while (nptr.next != NULL) { ... processing nptr = nptr.next }