001 /*
002 * Trie.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.alignment;
033
034 /**
035 * This class implements a trie, or a digital search tree. A trie is a multiway tree
036 * (each node can have multiple children) that represents a set of strings.
037 *
038 * <P>Each node contains data encapsulated in an object instance. Each edge spells out a
039 * character and each path from the root represents a string described by the characters
040 * labelling the traversed edges. Moreover, for each string represented, there is a unique
041 * path from the root.</P>
042 *
043 * <P>The trie of the following example represents the strings 'a', 'd', 'b', 'ac', 'ba',
044 * 'be', 'bd', 'bad' and 'bae'.</P>
045 *
046 * <CODE><BLOCKQUOTE><PRE>
047 * [0]
048 * --+--
049 * / | \
050 * a/ d| \b
051 * [1] [2] [4]
052 * | --+--
053 * | / | \
054 * c| a/ e| d\
055 * [3] [5] [6] [9]
056 * --+--
057 * / \
058 * d/ e\
059 * [7] [8]
060 * </PRE></BLOCKQUOTE></CODE>
061 *
062 * <P>It is easy to see that strings with common prefixes will branch off from each other
063 * at the first distinguishing character. This feature makes the trie a good data
064 * structure to identify and represent phrases of a text such as the ones induced by the
065 * Lempel-Ziv familiy of compression algorithms. For instance, the LZ78 version parses
066 * the text into phrases, where each phrase is the longest matching phrase seen previously
067 * plus one character.</P>
068 *
069 * <P>In this implementation, each node is actually an instance of this class. To build a
070 * trie, one must first create the root using the public constructor:</P>
071 *
072 * <CODE><BLOCKQUOTE><PRE>
073 * Trie root = new Trie (some_object);
074 * </PRE></BLOCKQUOTE></CODE>
075 *
076 * <P>Here <CODE>some_object</CODE> contains any relevant information encapsulated in an
077 * object instance. Typically, that's the only moment the public constructor is used. From
078 * now on, all new nodes will be added as a new child of one existing node using the
079 * <CODE>add</CODE> method:</P>
080 *
081 * <CODE><BLOCKQUOTE><PRE>
082 * new_node = any_node.add (some_object, character);
083 * </PRE></BLOCKQUOTE></CODE>
084 *
085 * <P>Here <CODE>character</CODE> is the character that will label the edge from
086 * <CODE>any_node</CODE> to <CODE>new_node</CODE>. Note that this transition must not
087 * already exist, otherwise an exception is raised.
088 *
089 * <P>To find the longest prefix of a given string, we follow a path from the root down
090 * the tree, character by character, with the <CODE>spellDown</CODE> method:</P>
091 *
092 * <CODE><BLOCKQUOTE><PRE>
093 * next_node = root;
094 * while (next_node != null)
095 * {
096 * current_node = next_node;
097 * char c = get next character from somewhere
098 * <B>next_node = current_node.spellDown (c);</B>
099 * }
100 * </PRE></BLOCKQUOTE></CODE>
101 *
102 * <P><CODE>spellDown</CODE> follows the edge out of <CODE>current_node</CODE> labelled by
103 * the character <CODE>c</CODE> and returns the next node. If there is no such a path, it
104 * returns null.</P>
105 *
106 * <P>To retrieve the information stored at any node, simply use the <CODE>getData</CODE>
107 * method.</P>
108 *
109 * <P>In fact, there are many ways to implement a trie. To avoid wasting space with
110 * multiple pointers at each node, this implementation uses an approach with a linked list
111 * of siblings. Each node actually contains a pointer to one of its children and a pointer
112 * to one of its siblings only. Together with the pointers, each node also stores the
113 * character that labels the edge to the pointed node.<P>
114 *
115 * <CODE><BLOCKQUOTE><PRE>
116 * [0]
117 * |
118 * a| d b
119 * [1]---[2]---[4]
120 * | |
121 * c| a| e d
122 * [3] [5]---[6]---[9]
123 * |
124 * d| e
125 * [7]---[8]
126 * </PRE></BLOCKQUOTE></CODE>
127 *
128 * <P>In this way, a trie is similar to a binary tree. Although this implementation is
129 * more efficient in terms of space, the search for a label with a given character leaving
130 * a node <CODE>n</CODE> is no more constant but proportional to the number of children of
131 * <CODE>n</CODE>. In the previous example, it is necessary to traverse three edges to
132 * reach node 9 from node 4 with character d.</P>
133 *
134 * <P>This class is used by the {@linkplain FactorSequence} to build a linked list of
135 * factors of a sequence in a LZ78 fashion, i.e. where each factor is the longest factor
136 * previously seen plus one character.</P>
137 *
138 * @author Sergio A. de Carvalho Jr.
139 * @see FactorSequence
140 */
141 public class Trie
142 {
143 /**
144 * A pointer to the first of this node's children.
145 */
146 protected Trie son;
147
148 /**
149 * The character that labels the edge from this node to the child node pointer by
150 * <CODE>son</CODE>.
151 */
152 protected char to_son;
153
154 /**
155 * A pointer to this node's next sibling.
156 */
157 protected Trie sibling;
158
159 /**
160 * The character that labels the edge from this node to the sibling pointer by
161 * <CODE>sibling</CODE>.
162 */
163 protected char to_sibling;
164
165 /**
166 * This node's stored data.
167 */
168 protected Object data;
169
170 /**
171 * Creates a new trie node with the specified data. This constructor is typically used
172 * by the client only once to instantiate the root node. After that, all new nodes are
173 * implicitly instantiated by the <CODE>add</CODE> method.
174 *
175 * @param data the data that will be associated with the new node
176 */
177 public Trie (Object data)
178 {
179 this.son = null;
180 this.sibling = null;
181 this.data = data;
182 }
183
184 /**
185 * Returns the data associated with this node.
186 *
187 * @return data associated with this node
188 */
189 public Object getData ()
190 {
191 return data;
192 }
193
194 /**
195 * Adds a new child to this node. The new node will be implicitly instantiated with
196 * the <CODE>data</CODE> argument, and the edge from this node to the new node will be
197 * labelled by the character argument. If this node already have an edge labelled with
198 * this character, an exception is raised. Otherwise, the new node created and
199 * returned.
200 *
201 * <P>If this node have no child, a new node is created straight away. Otherwise, the
202 * task is assigned to its first child that will add the new node as a sibling.</P>
203 *
204 * @param data the data that will be associated with the new node
205 * @param c the character that will label the edge from this node to the new node
206 * @return the added node
207 * @throws IllegalStateException if this node already have an edge labelled by
208 * <CODE>c</CODE>
209 */
210 public Trie add (Object data, char c)
211 {
212 if (son == null)
213 {
214 son = new Trie (data);
215 to_son = c;
216 return son;
217 }
218 else
219 {
220 if (to_son != c)
221 return son.addSibling (data, c);
222 else
223 // duplicate char
224 throw new IllegalStateException ("Failed to add character " + c +
225 " already exists.");
226 }
227 }
228
229 /**
230 * Adds a sibling to this node. The new node will be implicitly instantiated with
231 * the <CODE>data</CODE> argument, and the edge from this node to the new node will be
232 * labelled by the character argument. If this node already have a sibling with this
233 * character, an exception is raised. Otherwise, the new node is created and returned.
234 *
235 * <P>If this node have no direct sibling, a new node is created straight away.
236 * Otherwise, the task is assigned to its next sibling.</P>
237 *
238 * @param data the data that will be associated with the new node
239 * @param c the character that will label the edge from this node to the new node
240 * @return the added node
241 * @throws IllegalStateException if this node already have an edge labelled by
242 * <CODE>c</CODE>
243 */
244 protected Trie addSibling (Object data, char c)
245 {
246 if (sibling == null)
247 {
248 sibling = new Trie (data);
249 to_sibling = c;
250 return sibling;
251 }
252 else
253 {
254 if (to_sibling != c)
255 return sibling.addSibling (data, c);
256 else
257 // duplicate char
258 throw new IllegalStateException ("Failed to add character: " + c +
259 " already exists.");
260 }
261 }
262
263 /**
264 * Follows a path from this node to one of its children by spelling the character
265 * supplied as an argument. If there is no such a path, <CODE>null</CODE> is returned.
266 * Otherwise, the reached child node is returned.
267 *
268 * <P>If this node's direct child is reached with an edge labelled by the character,
269 * it is returned straight away. Otherwise, it is assigned the task of finding another
270 * sibling labelled with that character.</P>
271 *
272 * @param c the character that labels the path to be followed to this node's child
273 * @return the child node reached by traversing the edge labelled by <CODE>c</CODE>
274 */
275 public Trie spellDown (char c)
276 {
277 if (son == null) return null;
278
279 if (to_son == c)
280 return son;
281 else
282 return son.spellRight(c);
283 }
284
285 /**
286 * Follows a path from this node to one of its sibling by spelling the character
287 * supplied as an argument. If there is no such a path, <CODE>null</CODE> is returned.
288 * Otherwise, the reached sibling node is returned.
289 *
290 * <P>If this node's direct sibling is reached with an edge labelled by the character,
291 * it is returned straight away. Otherwise, it is assigned the task of finding another
292 * sibling labelled with that character.</P>
293 *
294 * @param c the character that labels the path to be followed to the sibling
295 * @return the sibling node reached by traversing the edge labelled by <CODE>c</CODE>
296 */
297 protected Trie spellRight (char c)
298 {
299 if (sibling == null) return null;
300
301 if (to_sibling == c)
302 return sibling;
303 else
304 return sibling.spellRight(c);
305 }
306 }