ICU 49.1.1  49.1.1
stringtriebuilder.h
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: stringtriebuilder.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010dec24
12 * created by: Markus W. Scherer
13 */
14 
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
17 
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
20 
21 // Forward declaration.
22 struct UHashtable;
23 typedef struct UHashtable UHashtable;
24 
29 enum UStringTrieBuildOption {
34  USTRINGTRIE_BUILD_FAST,
45  USTRINGTRIE_BUILD_SMALL
46 };
47 
49 
57 public:
58 #ifndef U_HIDE_INTERNAL_API
59 
60  static UBool hashNode(const void *node);
62  static UBool equalNodes(const void *left, const void *right);
63 #endif /* U_HIDE_INTERNAL_API */
64 
65 protected:
66  // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
67  // or else the compiler will create a public default constructor.
71  virtual ~StringTrieBuilder();
72 
73 #ifndef U_HIDE_INTERNAL_API
74 
75  void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
77  void deleteCompactBuilder();
78 
80  void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
81 
83  int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
85  int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
86 #endif /* U_HIDE_INTERNAL_API */
87 
88  class Node;
89 
90 #ifndef U_HIDE_INTERNAL_API
91 
92  Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
94  Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
95  int32_t length, UErrorCode &errorCode);
96 #endif /* U_HIDE_INTERNAL_API */
97 
99  virtual int32_t getElementStringLength(int32_t i) const = 0;
101  virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
103  virtual int32_t getElementValue(int32_t i) const = 0;
104 
105  // Finds the first unit index after this one where
106  // the first and last element have different units again.
108  virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
109 
110  // Number of different units at unitIndex.
112  virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
114  virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
116  virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
117 
119  virtual UBool matchNodesCanHaveValues() const = 0;
120 
122  virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
124  virtual int32_t getMinLinearMatch() const = 0;
126  virtual int32_t getMaxLinearMatchLength() const = 0;
127 
128 #ifndef U_HIDE_INTERNAL_API
129  // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
131  static const int32_t kMaxBranchLinearSubNodeLength=5;
132 
133  // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
134  // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
136  static const int32_t kMaxSplitBranchLevels=14;
137 
148  Node *registerNode(Node *newNode, UErrorCode &errorCode);
159  Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
160 
161  /*
162  * C++ note:
163  * registerNode() and registerFinalValue() take ownership of their input nodes,
164  * and only return owned nodes.
165  * If they see a failure UErrorCode, they will delete the input node.
166  * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
167  * If there is a failure, they return NULL.
168  *
169  * NULL Node pointers can be safely passed into other Nodes because
170  * they call the static Node::hashCode() which checks for a NULL pointer first.
171  *
172  * Therefore, as long as builder functions register a new node,
173  * they need to check for failures only before explicitly dereferencing
174  * a Node pointer, or before setting a new UErrorCode.
175  */
176 
177  // Hash set of nodes, maps from nodes to integer 1.
179  UHashtable *nodes;
180 
182  class Node : public UObject {
183  public:
184  Node(int32_t initialHash) : hash(initialHash), offset(0) {}
185  inline int32_t hashCode() const { return hash; }
186  // Handles node==NULL.
187  static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
188  // Base class operator==() compares the actual class types.
189  virtual UBool operator==(const Node &other) const;
190  inline UBool operator!=(const Node &other) const { return !operator==(other); }
218  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
219  // write() must set the offset to a positive value.
220  virtual void write(StringTrieBuilder &builder) = 0;
221  // See markRightEdgesFirst.
222  inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
223  StringTrieBuilder &builder) {
224  // Note: Edge numbers are negative, lastRight<=firstRight.
225  // If offset>0 then this node and its sub-nodes have been written already
226  // and we need not write them again.
227  // If this node is part of the unwritten right branch edge,
228  // then we wait until that is written.
229  if(offset<0 && (offset<lastRight || firstRight<offset)) {
230  write(builder);
231  }
232  }
233  inline int32_t getOffset() const { return offset; }
234  protected:
235  int32_t hash;
236  int32_t offset;
237  private:
238  // No ICU "poor man's RTTI" for this class nor its subclasses.
239  virtual UClassID getDynamicClassID() const;
240  };
241 
242  // This class should not be overridden because
243  // registerFinalValue() compares a stack-allocated FinalValueNode
244  // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
245  // with the input node, and the
246  // !Node::operator==(other) used inside FinalValueNode::operator==(other)
247  // will be false if the typeid's are different.
249  class FinalValueNode : public Node {
250  public:
251  FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
252  virtual UBool operator==(const Node &other) const;
253  virtual void write(StringTrieBuilder &builder);
254  protected:
255  int32_t value;
256  };
257 
259  class ValueNode : public Node {
260  public:
261  ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
262  virtual UBool operator==(const Node &other) const;
263  void setValue(int32_t v) {
264  hasValue=TRUE;
265  value=v;
266  hash=hash*37+v;
267  }
268  protected:
269  UBool hasValue;
270  int32_t value;
271  };
272 
275  public:
276  IntermediateValueNode(int32_t v, Node *nextNode)
277  : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
278  virtual UBool operator==(const Node &other) const;
279  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
280  virtual void write(StringTrieBuilder &builder);
281  protected:
282  Node *next;
283  };
284 
286  class LinearMatchNode : public ValueNode {
287  public:
288  LinearMatchNode(int32_t len, Node *nextNode)
289  : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
290  length(len), next(nextNode) {}
291  virtual UBool operator==(const Node &other) const;
292  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
293  protected:
294  int32_t length;
295  Node *next;
296  };
297 
299  class BranchNode : public Node {
300  public:
301  BranchNode(int32_t initialHash) : Node(initialHash) {}
302  protected:
303  int32_t firstEdgeNumber;
304  };
305 
307  class ListBranchNode : public BranchNode {
308  public:
309  ListBranchNode() : BranchNode(0x444444), length(0) {}
310  virtual UBool operator==(const Node &other) const;
311  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
312  virtual void write(StringTrieBuilder &builder);
313  // Adds a unit with a final value.
314  void add(int32_t c, int32_t value) {
315  units[length]=(UChar)c;
316  equal[length]=NULL;
317  values[length]=value;
318  ++length;
319  hash=(hash*37+c)*37+value;
320  }
321  // Adds a unit which leads to another match node.
322  void add(int32_t c, Node *node) {
323  units[length]=(UChar)c;
324  equal[length]=node;
325  values[length]=0;
326  ++length;
327  hash=(hash*37+c)*37+hashCode(node);
328  }
329  protected:
330  Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
331  int32_t length;
332  int32_t values[kMaxBranchLinearSubNodeLength];
333  UChar units[kMaxBranchLinearSubNodeLength];
334  };
335 
337  class SplitBranchNode : public BranchNode {
338  public:
339  SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
340  : BranchNode(((0x555555*37+middleUnit)*37+
341  hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
342  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
343  virtual UBool operator==(const Node &other) const;
344  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
345  virtual void write(StringTrieBuilder &builder);
346  protected:
347  UChar unit;
348  Node *lessThan;
349  Node *greaterOrEqual;
350  };
351 
352  // Branch head node, for writing the actual node lead unit.
354  class BranchHeadNode : public ValueNode {
355  public:
356  BranchHeadNode(int32_t len, Node *subNode)
357  : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
358  length(len), next(subNode) {}
359  virtual UBool operator==(const Node &other) const;
360  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
361  virtual void write(StringTrieBuilder &builder);
362  protected:
363  int32_t length;
364  Node *next; // A branch sub-node.
365  };
366 #endif /* U_HIDE_INTERNAL_API */
367 
369  virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
370  Node *nextNode) const = 0;
371 
373  virtual int32_t write(int32_t unit) = 0;
375  virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
377  virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
379  virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
381  virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
382 
383 private:
384  // No ICU "poor man's RTTI" for this class nor its subclasses.
385  virtual UClassID getDynamicClassID() const;
386 };
387 
389 
390 #endif // __STRINGTRIEBUILDER_H__