C/C++,树算法——Ukkonen的“后缀树“构造算法的源程序

2023-12-05

1 文本格式

// A C program to implement Ukkonen's Suffix Tree Construction
// And then build generalized suffix tree
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define MAX_CHAR 256

struct SuffixTreeNode {
struct SuffixTreeNode *children[MAX_CHAR];

//pointer to other node via suffix link
struct SuffixTreeNode *suffixLink;

/*(start, end) interval specifies the edge, by which the
node is connected to its parent node. Each edge will
connect two nodes, one parent and one child, and
(start, end) interval of a given edge will be stored
in the child node. Lets say there are two nods A and B
connected by an edge with indices (5, 8) then this
indices (5, 8) will be stored in node B. */
int start;
int *end;

/*for leaf nodes, it stores the index of suffix for
the path from root to leaf*/
int suffixIndex;
};

typedef struct SuffixTreeNode Node;

char text[100]; //Input string
Node *root = NULL; //Pointer to root node

/*lastNewNode will point to newly created internal node,
waiting for it's suffix link to be set, which might get
a new suffix link (other than root) in next extension of
same phase. lastNewNode will be set to NULL when last
newly created internal node (if there is any) got it's
suffix link reset to new internal node created in next
extension of same phase. */
Node *lastNewNode = NULL;
Node *activeNode = NULL;

/*activeEdge is represented as input string character
index (not the character itself)*/
int activeEdge = -1;
int activeLength = 0;

// remainingSuffixCount tells how many suffixes yet to
// be added in tree
int remainingSuffixCount = 0;
int leafEnd = -1;
int *rootEnd = NULL;
int *splitEnd = NULL;
int size = -1; //Length of input string

Node *newNode(int start, int *end)
{
Node *node =(Node*) malloc(sizeof(Node));
int i;
for (i = 0; i < MAX_CHAR; i++)
node->children[i] = NULL;

/*For root node, suffixLink will be set to NULL
For internal nodes, suffixLink will be set to root
by default in current extension and may change in
next extension*/
node->suffixLink = root;
node->start = start;
node->end = end;

/*suffixIndex will be set to -1 by default and
actual suffix index will be set later for leaves
at the end of all phases*/
node->suffixIndex = -1;
return node;
}

int edgeLength(Node *n) {
if(n == root)
return 0;
return *(n->end) - (n->start) + 1;
}

int walkDown(Node *currNode)
{
/*activePoint change for walk down (APCFWD) using
Skip/Count Trick (Trick 1). If activeLength is greater
than current edge length, set next internal node as
activeNode and adjust activeEdge and activeLength
accordingly to represent same activePoint*/
if (activeLength >= edgeLength(currNode))
{
activeEdge += edgeLength(currNode);
activeLength -= edgeLength(currNode);
activeNode = currNode;
return 1;
}
return 0;
}

void extendSuffixTree(int pos)
{
/*Extension Rule 1, this takes care of extending all
leaves created so far in tree*/
leafEnd = pos;

/*Increment remainingSuffixCount indicating that a
new suffix added to the list of suffixes yet to be
added in tree*/
remainingSuffixCount++;

/*set lastNewNode to NULL while starting a new phase,
indicating there is no internal node waiting for
it's suffix link reset in current phase*/
lastNewNode = NULL;

//Add all suffixes (yet to be added) one by one in tree
while(remainingSuffixCount > 0) {

if (activeLength == 0)
activeEdge = pos; //APCFALZ

// There is no outgoing edge starting with
// activeEdge from activeNode
if (activeNode->children] == NULL)
{
//Extension Rule 2 (A new leaf edge gets created)
activeNode->children] =
newNode(pos, &leafEnd);

/*A new leaf edge is created in above line starting
from an existing node (the current activeNode), and
if there is any internal node waiting for it's suffix
link get reset, point the suffix link from that last
internal node to current activeNode. Then set lastNewNode
to NULL indicating no more node waiting for suffix link
reset.*/
if (lastNewNode != NULL)
{
lastNewNode->suffixLink = activeNode;
lastNewNode = NULL;
}
}
// There is an outgoing edge starting with activeEdge
// from activeNode
else
{
// Get the next node at the end of edge starting
// with activeEdge
Node *next = activeNode->children];
if (walkDown(next))//Do walkdown
{
//Start from next node (the new activeNode)
continue;
}
/*Extension Rule 3 (current character being processed
is already on the edge)*/
if (text[next->start + activeLength] == text[pos])
{
//If a newly created node waiting for it's
//suffix link to be set, then set suffix link
//of that waiting node to current active node
if(lastNewNode != NULL && activeNode != root)
{
lastNewNode->suffixLink = activeNode;
lastNewNode = NULL;
}

//APCFER3
activeLength++;
/*STOP all further processing in this phase
and move on to next phase*/
break;
}

/*We will be here when activePoint is in middle of
the edge being traversed and current character
being processed is not on the edge (we fall off
the tree). In this case, we add a new internal node
and a new leaf edge going out of that new node. This
is Extension Rule 2, where a new leaf edge and a new
internal node get created*/
splitEnd = (int*) malloc(sizeof(int));
*splitEnd = next->start + activeLength - 1;

//New internal node
Node *split = newNode(next->start, splitEnd);
activeNode->children] = split;

//New leaf coming out of new internal node
split->children] = newNode(pos, &leafEnd);
next->start += activeLength;
split->children] = next;

/*We got a new internal node here. If there is any
internal node created in last extensions of same
phase which is still waiting for it's suffix link
reset, do it now.*/
if (lastNewNode != NULL)
{
/*suffixLink of lastNewNode points to current newly
created internal node*/
lastNewNode->suffixLink = split;
}

/*Make the current newly created internal node waiting
for it's suffix link reset (which is pointing to root
at present). If we come across any other internal node
(existing or newly created) in next extension of same
phase, when a new leaf edge gets added (i.e. when
Extension Rule 2 applies is any of the next extension
of same phase) at that point, suffixLink of this node
will point to that internal node.*/
lastNewNode = split;
}

/* One suffix got added in tree, decrement the count of
suffixes yet to be added.*/
remainingSuffixCount--;
if (activeNode == root && activeLength > 0) //APCFER2C1
{
activeLength--;
activeEdge = pos - remainingSuffixCount + 1;
}
else if (activeNode != root) //APCFER2C2
{
activeNode = activeNode->suffixLink;
}
}
}

void print(int i, int j)
{
int k;
for (k=i; k<=j && text[k] != '#'; k++)
printf("%c", text[k]);
if(k<=j)
printf("#");
}

//Print the suffix tree as well along with setting suffix index
//So tree will be printed in DFS manner
//Each edge along with it's suffix index will be printed
void setSuffixIndexByDFS(Node *n, int labelHeight)
{
if (n == NULL) return;

if (n->start != -1) //A non-root node
{
//Print the label on edge from parent to current node
print(n->start, *(n->end));
}
int leaf = 1;
int i;
for (i = 0; i < MAX_CHAR; i++)
{
if (n->children[i] != NULL)
{
if (leaf == 1 && n->start != -1)
printf(" [%d]\n", n->suffixIndex);

//Current node is not a leaf as it has outgoing
//edges from it.
leaf = 0;
setSuffixIndexByDFS(n->children[i], labelHeight +
edgeLength(n->children[i]));
}
}
if (leaf == 1)
{
for(i= n->start; i<= *(n->end); i++)
{
if(text[i] == '#') //Trim unwanted characters
{
n->end = (int*) malloc(sizeof(int));
*(n->end) = i;
}
}
n->suffixIndex = size - labelHeight;
printf(" [%d]\n", n->suffixIndex);
}
}

void freeSuffixTreeByPostOrder(Node *n)
{
if (n == NULL)
return;
int i;
for (i = 0; i < MAX_CHAR; i++)
{
if (n->children[i] != NULL)
{
freeSuffixTreeByPostOrder(n->children[i]);
}
}
if (n->suffixIndex == -1)
free(n->end);
free(n);
}

/*Build the suffix tree and print the edge labels along with
suffixIndex. suffixIndex for leaf edges will be >= 0 and
for non-leaf edges will be -1*/
void buildSuffixTree()
{
size = strlen(text);
int i;
rootEnd = (int*) malloc(sizeof(int));
*rootEnd = - 1;

/*Root is a special node with start and end indices as -1,
as it has no parent from where an edge comes to root*/
root = newNode(-1, rootEnd);

activeNode = root; //First activeNode will be root
for (i=0; i<size; i++)
extendSuffixTree(i);
int labelHeight = 0;
setSuffixIndexByDFS(root, labelHeight);

//Free the dynamically allocated memory
freeSuffixTreeByPostOrder(root);
}

// driver program to test above functions
int main(int argc, char *argv[])
{
// strcpy(text, "xabxac#abcabxabcd$"); buildSuffixTree();
strcpy(text, "xabxa#babxba$"); buildSuffixTree();
return 0;
}

2 代码格式

// A C program to implement Ukkonen's Suffix Tree Construction
// And then build generalized suffix tree
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define MAX_CHAR 256

struct SuffixTreeNode {
	struct SuffixTreeNode *children[MAX_CHAR];

	//pointer to other node via suffix link
	struct SuffixTreeNode *suffixLink;

	/*(start, end) interval specifies the edge, by which the
	node is connected to its parent node. Each edge will
	connect two nodes, one parent and one child, and
	(start, end) interval of a given edge will be stored
	in the child node. Lets say there are two nods A and B
	connected by an edge with indices (5, 8) then this
	indices (5, 8) will be stored in node B. */
	int start;
	int *end;

	/*for leaf nodes, it stores the index of suffix for
	the path from root to leaf*/
	int suffixIndex;
};

typedef struct SuffixTreeNode Node;

char text[100]; //Input string
Node *root = NULL; //Pointer to root node

/*lastNewNode will point to newly created internal node,
waiting for it's suffix link to be set, which might get
a new suffix link (other than root) in next extension of
same phase. lastNewNode will be set to NULL when last
newly created internal node (if there is any) got it's
suffix link reset to new internal node created in next
extension of same phase. */
Node *lastNewNode = NULL;
Node *activeNode = NULL;

/*activeEdge is represented as input string character
index (not the character itself)*/
int activeEdge = -1;
int activeLength = 0;

// remainingSuffixCount tells how many suffixes yet to
// be added in tree
int remainingSuffixCount = 0;
int leafEnd = -1;
int *rootEnd = NULL;
int *splitEnd = NULL;
int size = -1; //Length of input string

Node *newNode(int start, int *end)
{
	Node *node =(Node*) malloc(sizeof(Node));
	int i;
	for (i = 0; i < MAX_CHAR; i++)
		node->children[i] = NULL;

	/*For root node, suffixLink will be set to NULL
	For internal nodes, suffixLink will be set to root
	by default in current extension and may change in
	next extension*/
	node->suffixLink = root;
	node->start = start;
	node->end = end;

	/*suffixIndex will be set to -1 by default and
	actual suffix index will be set later for leaves
	at the end of all phases*/
	node->suffixIndex = -1;
	return node;
}

int edgeLength(Node *n) {
	if(n == root)
		return 0;
	return *(n->end) - (n->start) + 1;
}

int walkDown(Node *currNode)
{
	/*activePoint change for walk down (APCFWD) using
	Skip/Count Trick (Trick 1). If activeLength is greater
	than current edge length, set next internal node as
	activeNode and adjust activeEdge and activeLength
	accordingly to represent same activePoint*/
	if (activeLength >= edgeLength(currNode))
	{
		activeEdge += edgeLength(currNode);
		activeLength -= edgeLength(currNode);
		activeNode = currNode;
		return 1;
	}
	return 0;
}

void extendSuffixTree(int pos)
{
	/*Extension Rule 1, this takes care of extending all
	leaves created so far in tree*/
	leafEnd = pos;

	/*Increment remainingSuffixCount indicating that a
	new suffix added to the list of suffixes yet to be
	added in tree*/
	remainingSuffixCount++;

	/*set lastNewNode to NULL while starting a new phase,
	indicating there is no internal node waiting for
	it's suffix link reset in current phase*/
	lastNewNode = NULL;

	//Add all suffixes (yet to be added) one by one in tree
	while(remainingSuffixCount > 0) {

		if (activeLength == 0)
			activeEdge = pos; //APCFALZ

		// There is no outgoing edge starting with
		// activeEdge from activeNode
		if (activeNode->children] == NULL)
		{
			//Extension Rule 2 (A new leaf edge gets created)
			activeNode->children] =
										newNode(pos, &leafEnd);

			/*A new leaf edge is created in above line starting
			from an existing node (the current activeNode), and
			if there is any internal node waiting for it's suffix
			link get reset, point the suffix link from that last
			internal node to current activeNode. Then set lastNewNode
			to NULL indicating no more node waiting for suffix link
			reset.*/
			if (lastNewNode != NULL)
			{
				lastNewNode->suffixLink = activeNode;
				lastNewNode = NULL;
			}
		}
		// There is an outgoing edge starting with activeEdge
		// from activeNode
		else
		{
			// Get the next node at the end of edge starting
			// with activeEdge
			Node *next = activeNode->children];
			if (walkDown(next))//Do walkdown
			{
				//Start from next node (the new activeNode)
				continue;
			}
			/*Extension Rule 3 (current character being processed
			is already on the edge)*/
			if (text[next->start + activeLength] == text[pos])
			{
				//If a newly created node waiting for it's
				//suffix link to be set, then set suffix link
				//of that waiting node to current active node
				if(lastNewNode != NULL && activeNode != root)
				{
					lastNewNode->suffixLink = activeNode;
					lastNewNode = NULL;
				}

				//APCFER3
				activeLength++;
				/*STOP all further processing in this phase
				and move on to next phase*/
				break;
			}

			/*We will be here when activePoint is in middle of
			the edge being traversed and current character
			being processed is not on the edge (we fall off
			the tree). In this case, we add a new internal node
			and a new leaf edge going out of that new node. This
			is Extension Rule 2, where a new leaf edge and a new
			internal node get created*/
			splitEnd = (int*) malloc(sizeof(int));
			*splitEnd = next->start + activeLength - 1;

			//New internal node
			Node *split = newNode(next->start, splitEnd);
			activeNode->children] = split;

			//New leaf coming out of new internal node
			split->children] = newNode(pos, &leafEnd);
			next->start += activeLength;
			split->children] = next;

			/*We got a new internal node here. If there is any
			internal node created in last extensions of same
			phase which is still waiting for it's suffix link
			reset, do it now.*/
			if (lastNewNode != NULL)
			{
			/*suffixLink of lastNewNode points to current newly
			created internal node*/
				lastNewNode->suffixLink = split;
			}

			/*Make the current newly created internal node waiting
			for it's suffix link reset (which is pointing to root
			at present). If we come across any other internal node
			(existing or newly created) in next extension of same
			phase, when a new leaf edge gets added (i.e. when
			Extension Rule 2 applies is any of the next extension
			of same phase) at that point, suffixLink of this node
			will point to that internal node.*/
			lastNewNode = split;
		}

		/* One suffix got added in tree, decrement the count of
		suffixes yet to be added.*/
		remainingSuffixCount--;
		if (activeNode == root && activeLength > 0) //APCFER2C1
		{
			activeLength--;
			activeEdge = pos - remainingSuffixCount + 1;
		}
		else if (activeNode != root) //APCFER2C2
		{
			activeNode = activeNode->suffixLink;
		}
	}
}

void print(int i, int j)
{
	int k;
	for (k=i; k<=j && text[k] != '#'; k++)
		printf("%c", text[k]);
	if(k<=j)
		printf("#");
}

//Print the suffix tree as well along with setting suffix index
//So tree will be printed in DFS manner
//Each edge along with it's suffix index will be printed
void setSuffixIndexByDFS(Node *n, int labelHeight)
{
	if (n == NULL) return;

	if (n->start != -1) //A non-root node
	{
		//Print the label on edge from parent to current node
		print(n->start, *(n->end));
	}
	int leaf = 1;
	int i;
	for (i = 0; i < MAX_CHAR; i++)
	{
		if (n->children[i] != NULL)
		{
			if (leaf == 1 && n->start != -1)
				printf(" [%d]\n", n->suffixIndex);

			//Current node is not a leaf as it has outgoing
			//edges from it.
			leaf = 0;
			setSuffixIndexByDFS(n->children[i], labelHeight +
								edgeLength(n->children[i]));
		}
	}
	if (leaf == 1)
	{
		for(i= n->start; i<= *(n->end); i++)
		{
			if(text[i] == '#') //Trim unwanted characters
			{
				n->end = (int*) malloc(sizeof(int));
				*(n->end) = i;
			}
		}
		n->suffixIndex = size - labelHeight;
		printf(" [%d]\n", n->suffixIndex);
	}
}

void freeSuffixTreeByPostOrder(Node *n)
{
	if (n == NULL)
		return;
	int i;
	for (i = 0; i < MAX_CHAR; i++)
	{
		if (n->children[i] != NULL)
		{
			freeSuffixTreeByPostOrder(n->children[i]);
		}
	}
	if (n->suffixIndex == -1)
		free(n->end);
	free(n);
}

/*Build the suffix tree and print the edge labels along with
suffixIndex. suffixIndex for leaf edges will be >= 0 and
for non-leaf edges will be -1*/
void buildSuffixTree()
{
	size = strlen(text);
	int i;
	rootEnd = (int*) malloc(sizeof(int));
	*rootEnd = - 1;

	/*Root is a special node with start and end indices as -1,
	as it has no parent from where an edge comes to root*/
	root = newNode(-1, rootEnd);

	activeNode = root; //First activeNode will be root
	for (i=0; i<size; i++)
		extendSuffixTree(i);
	int labelHeight = 0;
	setSuffixIndexByDFS(root, labelHeight);

	//Free the dynamically allocated memory
	freeSuffixTreeByPostOrder(root);
}

// driver program to test above functions
int main(int argc, char *argv[])
{
// strcpy(text, "xabxac#abcabxabcd$"); buildSuffixTree();
	strcpy(text, "xabxa#babxba$"); buildSuffixTree();
	return 0;
}

本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

C/C++,树算法——Ukkonen的“后缀树“构造算法的源程序 的相关文章

  • 低级挂钩/SetWindowsHookEx lParam 自动重复?

    在这里阅读 Windows PC 上如何实现键盘自动重复 https stackoverflow com questions 876852 how is keyboard auto repeat implemented on a windo
  • 为什么在排序输入上插入到树中比随机输入更快?

    现在我一直听说从随机选择的数据构建二叉搜索树比有序数据更快 这仅仅是因为有序数据需要显式重新平衡以将树高度保持在最低限度 最近我实现了一个不可变的treap http en wikipedia org wiki Treap 一种特殊的二叉搜
  • ptrace和waitpid有什么关系?

    我正在练习使用ptrace但我不太了解它和之间的关系waitpid 这是我的测试程序 int main int argc char argv pid t pid 22092 if ptrace PTRACE ATTACH pid NULL
  • 这种对有效类型规则的使用是否严格遵守?

    C99和C11中的有效类型规则规定 没有声明类型的存储可以用任何类型写入 并且存储非字符类型的值将相应地设置存储的有效类型 抛开 INT MAX 可能小于 123456789 的事实不谈 以下代码对有效类型规则的使用是否严格符合 inclu
  • 弹出 x86 堆栈以访问函数 arg 时出现分段错误

    我正在尝试链接 x86 程序集和 C 我的C程序 extern int plus 10 int include
  • 如何进行Visual Studio格式字典初始化?

    所有 Visual Studio 也包括 2012 不格式化以下内容 messageProcessor new Dictionary
  • 为什么下面的重叠比较总是评估为 true

    我不明白为什么以下代码有警告 指出重叠比较始终评估为真 接下来的语句永远不会被执行 QVariant MainModel data const QModelIndex index int role const if index isVali
  • 公共基类打破了元组的空基类优化

    gcc 4 7 1 对元组进行空基类优化 我认为这是一个非常有用的功能 然而 这似乎有一个意想不到的限制 include
  • 如何在 C# 中创建 PKCS12 .p12 文件?

    这可能是一个n00b问题 但我在这方面确实没有任何经验 我需要创建一个包含 X509 证书和私钥的 p12 捆绑包 我当前有两个对象 X509Certificate2 和包含关键信息的 RSAParameters 对象 如何将它们合并到 p
  • Xcode 新手无法用 C++ 打开文件?

    我一直在我参加的课程中使用 Windows 但我正在尝试运行基本代码来弄清楚如何从 Xcode 上的文件打开 关闭 输入 输出 而我通常在 Visual Studio 上使用的代码不是不知道为什么 谢谢 include
  • 如何检查给定调用站点的重载决策集

    如何检查重载解析集 我在多个调用站点中使用了 4 个相互竞争的函数 在一个调用站点中 我期望调用一个函数 但编译器会选择另一个函数 我不知道为什么 这不是微不足道的 为了了解发生了什么 我正在使用enable if disable if打开
  • 我如何模拟 UserManager 和 RoleManager 进行单元测试

    我模拟了抽象类来测试类的具体方法 如下所示 var mock new Mock
  • Bazel:将编译标志添加到默认 C++ 工具链

    我想向默认的 C 工具链添加一些编译器和链接器标志 以便我构建的所有目标 本地或导入 共享它们 我知道可以定义我自己的工具链 但我不想这样做 因为它非常复杂且容易出错 理想情况下我想要这样的东西 cc toolchain cc defaul
  • 使用 C# 的异步 WebRequest

    您好 我有一个函数 它将 url Get 参数传递到网络服务器上的 php 文件 并等待文件的响应 通常需要 10 20 秒 我想将其放入一个循环中 因为我必须一次将这些 Get 请求发送到大约 5 个不同的 php 文件 但是当我尝试将其
  • C# 中的类和模块有什么用

    有人可以解释一下类和模块之间的区别吗 你什么时候使用其中一种而不是另一种 我正在使用 C 更新 我的意思是相当于 VB 模块的 C 版本 这在很大程度上取决于您所指的 模块 Visual Basic 的模块 C 中没有真正等效的 VB Ne
  • PowerShell 与 MongoDB C# 驱动程序方法不兼容?

    由 C 泛型引起的最新 MongoDB 驱动程序的问题 Cannot find an overload for GetCollection and the argument count 1 我可能可以使用其他没有泛型的 GetCollect
  • 如何在RcppParallel中调用用户定义的函数?

    受到文章的启发http gallery rcpp org articles parallel distance matrix http gallery rcpp org articles parallel distance matrix 我
  • 曲线/路径骨架二值图像处理

    我正在尝试开发一个可以处理图像骨架的路径 曲线的代码 我想要一个来自两点之间骨架的点向量 该代码在添加一些点后结束 我没有找到解决方案 include opencv2 highgui highgui hpp include opencv2
  • 查找文本文件中每行的行大小

    如何计算每行中的字符或数字数量 是否有类似 EOF 的东西更像是行尾 您可以遍历行中的每个字符并不断增加计数器直到行尾 n 遇到 确保以文本模式打开文件 r 而不是二进制模式 rb 否则流不会自动将不同平台的行结束序列转换为 n 人物 这是
  • 使用 List.Contains 方法为 LINQ 构建表达式树

    Problem 我正在重构一些LINQ查询我们的 Web 应用程序中的多个报告 并且我尝试将一些重复的查询谓词移至它们自己的中IQueryable扩展方法 以便我们可以将它们重新用于这些报告以及将来的报告 正如您可能推断的那样 我已经重构了

随机推荐