/* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Copyright 1997 - July 2008 CWI, August 2008 - 2022 MonetDB B.V. */ #ifndef _GDK_ATOMS_H_ #define _GDK_ATOMS_H_ /* atomFromStr returns the number of bytes of the input string that * were processed. atomToStr returns the length of the string * produced. Both functions return -1 on (any kind of) failure. If * *dst is not NULL, *len specifies the available space. If there is * not enough space, or if *dst is NULL, *dst will be freed (if not * NULL) and a new buffer will be allocated and returned in *dst. * *len will be set to reflect the actual size allocated. If * allocation fails, *dst will be NULL on return and *len is * undefined. In any case, if the function returns, *buf is either * NULL or a valid pointer and then *len is the size of the area *buf * points to. * * atomCmp returns a value less than zero/equal to zero/greater than * zer if the first argument points to a values which is deemed * smaller/equal to/larger than the value pointed to by the second * argument. * * atomHash calculates a hash function for the value pointed to by the * argument. */ #define IDLENGTH 64 /* maximum BAT id length */ typedef struct { /* simple attributes */ char name[IDLENGTH]; uint8_t storage; /* stored as another type? */ bool linear; /* atom can be ordered linearly */ uint16_t size; /* fixed size of atom */ /* automatically generated fields */ const void *atomNull; /* global nil value */ /* generic (fixed + varsized atom) ADT functions */ ssize_t (*atomFromStr) (const char *src, size_t *len, void **dst, bool external); ssize_t (*atomToStr) (char **dst, size_t *len, const void *src, bool external); void *(*atomRead) (void *dst, size_t *dstlen, stream *s, size_t cnt); gdk_return (*atomWrite) (const void *src, stream *s, size_t cnt); int (*atomCmp) (const void *v1, const void *v2); BUN (*atomHash) (const void *v); /* optional functions */ gdk_return (*atomFix) (const void *atom); gdk_return (*atomUnfix) (const void *atom); /* varsized atom-only ADT functions */ var_t (*atomPut) (BAT *, var_t *off, const void *src); void (*atomDel) (Heap *, var_t *atom); size_t (*atomLen) (const void *atom); gdk_return (*atomHeap) (Heap *, size_t); } atomDesc; #define MAXATOMS 128 gdk_export atomDesc BATatoms[MAXATOMS]; gdk_export int GDKatomcnt; gdk_export int ATOMallocate(const char *nme); gdk_export int ATOMindex(const char *nme); gdk_export str ATOMname(int id); gdk_export size_t ATOMlen(int id, const void *v); gdk_export void *ATOMnil(int id) __attribute__((__malloc__)); gdk_export int ATOMprint(int id, const void *val, stream *fd); gdk_export char *ATOMformat(int id, const void *val); gdk_export void *ATOMdup(int id, const void *val); /* * @- maximum atomic string lengths */ #define bitStrlen 8 #define bteStrlen 8 #define shtStrlen 12 #define intStrlen 24 #if SIZEOF_OID == SIZEOF_INT #define oidStrlen 24 #else #define oidStrlen 48 #endif #if SIZEOF_PTR == SIZEOF_INT #define ptrStrlen 24 #else #define ptrStrlen 48 #endif #define lngStrlen 48 #ifdef HAVE_HGE #define hgeStrlen 96 #endif #define fltStrlen 48 #define dblStrlen 96 /* * The system comes with the traditional atomic types: int (4 bytes), * bool(1 byte) and str (variable). In addition, we support the notion * of an OID type, which ensures uniqueness of its members. This * leads to the following type descriptor table. */ #ifdef HAVE_HGE gdk_export ssize_t hgeFromStr(const char *src, size_t *len, hge **dst, bool external); gdk_export ssize_t hgeToStr(str *dst, size_t *len, const hge *src, bool external); #endif gdk_export ssize_t lngFromStr(const char *src, size_t *len, lng **dst, bool external); gdk_export ssize_t lngToStr(str *dst, size_t *len, const lng *src, bool external); gdk_export ssize_t intFromStr(const char *src, size_t *len, int **dst, bool external); gdk_export ssize_t intToStr(str *dst, size_t *len, const int *src, bool external); gdk_export ssize_t batFromStr(const char *src, size_t *len, bat **dst, bool external); gdk_export ssize_t batToStr(str *dst, size_t *len, const bat *src, bool external); gdk_export ssize_t ptrFromStr(const char *src, size_t *len, ptr **dst, bool external); gdk_export ssize_t ptrToStr(str *dst, size_t *len, const ptr *src, bool external); gdk_export ssize_t bitFromStr(const char *src, size_t *len, bit **dst, bool external); gdk_export ssize_t bitToStr(str *dst, size_t *len, const bit *src, bool external); gdk_export ssize_t OIDfromStr(const char *src, size_t *len, oid **dst, bool external); gdk_export ssize_t OIDtoStr(str *dst, size_t *len, const oid *src, bool external); gdk_export ssize_t shtFromStr(const char *src, size_t *len, sht **dst, bool external); gdk_export ssize_t shtToStr(str *dst, size_t *len, const sht *src, bool external); gdk_export ssize_t bteFromStr(const char *src, size_t *len, bte **dst, bool external); gdk_export ssize_t bteToStr(str *dst, size_t *len, const bte *src, bool external); gdk_export ssize_t fltFromStr(const char *src, size_t *len, flt **dst, bool external); gdk_export ssize_t fltToStr(str *dst, size_t *len, const flt *src, bool external); gdk_export ssize_t dblFromStr(const char *src, size_t *len, dbl **dst, bool external); gdk_export ssize_t dblToStr(str *dst, size_t *len, const dbl *src, bool external); gdk_export ssize_t GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len); gdk_export ssize_t strFromStr(const char *restrict src, size_t *restrict len, str *restrict dst, bool external); gdk_export size_t escapedStrlen(const char *restrict src, const char *sep1, const char *sep2, int quote); gdk_export size_t escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char *sep1, const char *sep2, int quote); /* * @- nil values * All types have a single value designated as a NIL value. It * designates a missing value and it is ignored (forbidden) in several * primitives. The current policy is to use the smallest value in any * ordered domain. The routine atomnil returns a pointer to the nil * value representation. */ #define GDK_bit_max ((bit) 1) #define GDK_bit_min ((bit) 0) #define GDK_bte_max ((bte) INT8_MAX) #define GDK_bte_min ((bte) INT8_MIN+1) #define GDK_sht_max ((sht) INT16_MAX) #define GDK_sht_min ((sht) INT16_MIN+1) #define GDK_int_max ((int) INT32_MAX) #define GDK_int_min ((int) INT32_MIN+1) #define GDK_lng_max ((lng) INT64_MAX) #define GDK_lng_min ((lng) INT64_MIN+1) #ifdef HAVE_HGE #define GDK_hge_max ((((hge) 1) << 126) - 1 + (((hge) 1) << 126)) #define GDK_hge_min (-GDK_hge_max) #endif #define GDK_flt_max ((flt) FLT_MAX) #define GDK_flt_min ((flt) -FLT_MAX) #define GDK_dbl_max ((dbl) DBL_MAX) #define GDK_dbl_min ((dbl) -DBL_MAX) #define GDK_oid_max (((oid) 1 << ((8 * SIZEOF_OID) - 1)) - 1) #define GDK_oid_min ((oid) 0) /* representation of the nil */ gdk_export const bte bte_nil; gdk_export const sht sht_nil; gdk_export const int int_nil; #ifdef NAN_CANNOT_BE_USED_AS_INITIALIZER /* Definition of NAN is seriously broken on Intel compiler (at least * in some versions), so we work around it. */ union _flt_nil_t { uint32_t l; flt f; }; gdk_export const union _flt_nil_t _flt_nil_; #define flt_nil (_flt_nil_.f) union _dbl_nil_t { uint64_t l; dbl d; }; gdk_export const union _dbl_nil_t _dbl_nil_; #define dbl_nil (_dbl_nil_.d) #else gdk_export const flt flt_nil; gdk_export const dbl dbl_nil; #endif gdk_export const lng lng_nil; #ifdef HAVE_HGE gdk_export const hge hge_nil; #endif gdk_export const oid oid_nil; gdk_export const char str_nil[2]; gdk_export const ptr ptr_nil; gdk_export const uuid uuid_nil; /* derived NIL values - OIDDEPEND */ #define bit_nil ((bit) bte_nil) #define bat_nil ((bat) int_nil) #define void_nil oid_nil #define is_bit_nil(v) ((v) == GDK_bte_min-1) #define is_bte_nil(v) ((v) == GDK_bte_min-1) #define is_sht_nil(v) ((v) == GDK_sht_min-1) #define is_int_nil(v) ((v) == GDK_int_min-1) #define is_lng_nil(v) ((v) == GDK_lng_min-1) #ifdef HAVE_HGE #define is_hge_nil(v) ((v) == GDK_hge_min-1) #endif #define is_oid_nil(v) ((v) == ((oid) 1 << ((8 * SIZEOF_OID) - 1))) #define is_flt_nil(v) isnan(v) #define is_dbl_nil(v) isnan(v) #define is_bat_nil(v) (((v) & 0x7FFFFFFF) == 0) /* v == bat_nil || v == 0 */ #include #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && _MSC_VER < 1800 #include #define isnan(x) _isnan(x) #define isinf(x) (_fpclass(x) & (_FPCLASS_NINF | _FPCLASS_PINF)) #define isfinite(x) _finite(x) #endif #ifdef HAVE_HGE #define is_uuid_nil(x) ((x).h == 0) #else #ifdef HAVE_UUID #define is_uuid_nil(x) uuid_is_null((x).u) #else #define is_uuid_nil(x) (memcmp((x).u, uuid_nil.u, UUID_SIZE) == 0) #endif #endif /* * @- Derived types * In all algorithms across GDK, you will find switches on the types * (bte, sht, int, flt, dbl, lng, hge, str). They respectively * represent an octet, a 16-bit int, a 32-bit int, a 32-bit float, a * 64-bit double, a 64-bit int, a 128-bit int, and a pointer-sized location * of a char-buffer (ended by a zero char). * * In contrast, the types (bit, ptr, bat, oid) are derived types. They * do not occur in the switches. The ATOMstorage macro maps them * respectively onto a @code{ bte}, @code{ int} (pointers are 32-bit), * @code{ int}, and @code{ int}. OIDs are 32-bit. * * This approach makes it tractable to switch to 64-bits OIDs, or to a * fully 64-bits OS easily. One only has to map the @code{ oid} and * @code{ ptr} types to @code{ lng} instead of @code{ int}. * * Derived types mimic their fathers in many ways. They inherit the * @code{ size}, @code{ linear}, and @code{ null} * properties of their father. The same goes for the * ADT functions HASH, CMP, PUT, NULL, DEL, LEN, and HEAP. So, a * derived type differs in only two ways from its father: * @table @code * @item [string representation] * the only two ADT operations specific for a derived type are FROMSTR * and TOSTR. * @item [identity] * (a @code{ bit} is really of a different type than @code{ bte}). The * set of operations on derived type values or BATs of such types may * differ from the sets of operations on the father type. * @end table */ /* use "do ... while(0)" so that lhs can safely be used in if statements */ #define ATOMstorage(t) BATatoms[t].storage #define ATOMsize(t) BATatoms[t].size #define ATOMfromstr(t,s,l,src,ext) BATatoms[t].atomFromStr(src,l,s,ext) #define ATOMnilptr(t) BATatoms[t].atomNull #define ATOMcompare(t) BATatoms[t].atomCmp #define ATOMcmp(t,l,r) ((*ATOMcompare(t))(l, r)) #define ATOMhash(t,src) BATatoms[t].atomHash(src) #define ATOMdel(t,hp,src) do if (BATatoms[t].atomDel) BATatoms[t].atomDel(hp,src); while (0) #define ATOMvarsized(t) (BATatoms[t].atomPut != NULL) #define ATOMlinear(t) BATatoms[t].linear #define ATOMtype(t) ((t) == TYPE_void ? TYPE_oid : (t)) #define ATOMfix(t,v) (BATatoms[t].atomFix ? BATatoms[t].atomFix(v) : GDK_SUCCEED) #define ATOMunfix(t,v) (BATatoms[t].atomUnfix ? BATatoms[t].atomUnfix(v) : GDK_SUCCEED) /* The base type is the storage type if the comparison function, the * hash function, and the nil value are the same as those of the * storage type; otherwise it is the type itself. */ #define ATOMbasetype(t) ((t) != ATOMstorage(t) && \ ATOMnilptr(t) == ATOMnilptr(ATOMstorage(t)) && \ ATOMcompare(t) == ATOMcompare(ATOMstorage(t)) && \ BATatoms[t].atomHash == BATatoms[ATOMstorage(t)].atomHash ? \ ATOMstorage(t) : (t)) /* * In case that atoms are added to a bat, their logical reference * count should be incremented (and decremented if deleted). Notice * that BATs with atomic types that have logical references (e.g. BATs * of BATs but also BATs of ODMG odSet) can never be persistent, as * this would make the commit tremendously complicated. */ static inline gdk_return __attribute__((__warn_unused_result__)) ATOMputVAR(BAT *b, var_t *dst, const void *src) { assert(BATatoms[b->ttype].atomPut != NULL); if ((*BATatoms[b->ttype].atomPut)(b, dst, src) == (var_t) -1) return GDK_FAIL; return GDK_SUCCEED; } static inline gdk_return __attribute__((__warn_unused_result__)) ATOMputFIX(int type, void *dst, const void *src) { gdk_return rc; assert(BATatoms[type].atomPut == NULL); rc = ATOMfix(type, src); if (rc != GDK_SUCCEED) return rc; switch (ATOMsize(type)) { case 0: /* void */ break; case 1: * (bte *) dst = * (bte *) src; break; case 2: * (sht *) dst = * (sht *) src; break; case 4: * (int *) dst = * (int *) src; break; case 8: * (lng *) dst = * (lng *) src; break; case 16: #ifdef HAVE_HGE * (hge *) dst = * (hge *) src; #else * (uuid *) dst = * (uuid *) src; #endif break; default: memcpy(dst, src, ATOMsize(type)); break; } return GDK_SUCCEED; } static inline gdk_return __attribute__((__warn_unused_result__)) ATOMreplaceVAR(BAT *b, var_t *dst, const void *src) { var_t loc = *dst; int type = b->ttype; assert(BATatoms[type].atomPut != NULL); if ((*BATatoms[type].atomPut)(b, &loc, src) == (var_t) -1) return GDK_FAIL; if (ATOMunfix(type, dst) != GDK_SUCCEED) return GDK_FAIL; ATOMdel(type, b->tvheap, dst); *dst = loc; return ATOMfix(type, src); } /* string heaps: * - strings are 8 byte aligned * - start with a 1024 bucket hash table * - heaps < 64KiB are fully duplicate eliminated with this hash tables * - heaps >= 64KiB are opportunistically (imperfect) duplicate * eliminated as only the last 128KiB chunk is considered and there * is no linked list * - buckets and next pointers are unsigned short "indices" * - indices should be multiplied by 8 and takes from ELIMBASE to get * an offset * Note that a 64KiB chunk of the heap contains at most 8K 8-byte * aligned strings. The 1K bucket list means that in worst load, the * list length is 8 (OK). */ #define GDK_STRHASHTABLE (1<<10) /* 1024 */ #define GDK_STRHASHMASK (GDK_STRHASHTABLE-1) #define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t)) #define GDK_ELIMPOWER 16 /* 64KiB is the threshold */ #define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT) #define GDK_ELIMLIMIT (1<> GDK_ELIMPOWER) << GDK_ELIMPOWER) #define GDK_VAROFFSET ((var_t) GDK_STRHASHSIZE) /* * @- String Comparison, NILs and UTF-8 * * Using the char* type for strings is handy as this is the type of * any constant strings in a C/C++ program. Therefore, MonetDB uses * this definition for str. However, different compilers and * platforms use either signed or unsigned characters for the char * type. It is required that string ordering in MonetDB is consistent * over platforms though. * * As for the choice how strings should be ordered, our support for * UTF-8 actually imposes that it should follow 'unsigned char' * doctrine (like in the AIX native compiler). In this semantics, * though we have to take corrective action to ensure that str(nil) is * the smallest value of the domain. */ static inline bool __attribute__((__pure__)) strEQ(const char *l, const char *r) { return strcmp(l, r) == 0; } static inline bool __attribute__((__pure__)) strNil(const char *s) { return s == NULL || (s[0] == '\200' && s[1] == '\0'); } static inline size_t __attribute__((__pure__)) strLen(const char *s) { return strNil(s) ? 2 : strlen(s) + 1; } static inline int __attribute__((__pure__)) strCmp(const char *l, const char *r) { return strNil(r) ? !strNil(l) : strNil(l) ? -1 : strcmp(l, r); } static inline size_t VarHeapVal(const void *b, BUN p, int w) { switch (w) { case 1: return (size_t) ((const uint8_t *) b)[p] + GDK_VAROFFSET; case 2: return (size_t) ((const uint16_t *) b)[p] + GDK_VAROFFSET; #if SIZEOF_VAR_T == 8 case 4: return (size_t) ((const uint32_t *) b)[p]; #endif default: return (size_t) ((const var_t *) b)[p]; } } static inline BUN __attribute__((__pure__)) strHash(const char *key) { BUN y = 0; for (BUN i = 0; key[i]; i++) { y += key[i]; y += (y << 10); y ^= (y >> 6); } y += (y << 3); y ^= (y >> 11); y += (y << 15); return y; } #endif /* _GDK_ATOMS_H_ */