365 lines
11 KiB
C
365 lines
11 KiB
C
|
|
|
||
|
|
#include "regexp_tonfa.h"
|
||
|
|
#include "error.h"
|
||
|
|
#include "nfa.h"
|
||
|
|
#include "regexp.h"
|
||
|
|
#include "type_boolarray.h"
|
||
|
|
#include "type_dequeue.h"
|
||
|
|
#include <stdbool.h>
|
||
|
|
#include <stdlib.h>
|
||
|
|
|
||
|
|
/****************************/
|
||
|
|
/*+ Algorithme de Glushkov +*/
|
||
|
|
/****************************/
|
||
|
|
|
||
|
|
/*
 * Counts the letter leaves of the syntax tree rooted at expr.
 *
 * A NULL subtree counts for 0 and a node whose operator is CHAR counts for
 * exactly 1 (its children are not visited). Any other node counts for the
 * sum of its left and right subtrees.
 */
uint reg_countletters(regexp *expr) {
  uint total = 0;

  if (expr != NULL) {
    if (expr->op == CHAR) {
      // A letter leaf: nothing below it needs to be inspected.
      total = 1;
    } else {
      // Inner node (or leaf without letter): add up both subtrees.
      uint in_left = reg_countletters(expr->left);
      uint in_right = reg_countletters(expr->right);
      total = in_left + in_right;
    }
  }

  return total;
}
|
||
|
|
|
||
|
|
/*
 * Builds the Glushkov information describing the empty language for an
 * expression that contains `size` letters in total.
 *
 * The empty word is not part of the language, so epsilon is false. The first
 * and last sets hold `size` cells and the follow set holds size * size cells
 * (one per ordered pair of letters); all three are left exactly as
 * create_barray() returns them (presumably all false - confirm in
 * type_boolarray.h).
 *
 * Returns a newly allocated object (to be released with reg_gk_delete()), or
 * NULL if the structure cannot be allocated.
 */
glushkov_info *reg_create_gk_emp(uint size) {
  glushkov_info *gk = malloc(sizeof *gk);
  if (gk == NULL) {
    // Out of memory: report it to the caller instead of dereferencing NULL.
    return NULL;
  }

  gk->size = size;
  // Empty set: even the empty word is rejected.
  gk->epsilon = false;
  gk->first = create_barray(size);
  gk->last = create_barray(size);
  gk->follow = create_barray(size * size);

  return gk;
}
|
||
|
|
|
||
|
|
/*
 * Builds the Glushkov information describing the language that contains only
 * the empty word, for an expression that contains `size` letters in total.
 *
 * epsilon is true. The first and last sets hold `size` cells and the follow
 * set holds size * size cells; all three are left exactly as create_barray()
 * returns them (presumably all false - confirm in type_boolarray.h).
 *
 * Returns a newly allocated object (to be released with reg_gk_delete()), or
 * NULL if the structure cannot be allocated.
 */
glushkov_info *reg_create_gk_eps(uint size) {
  glushkov_info *gk = malloc(sizeof *gk);
  if (gk == NULL) {
    // Out of memory: report it to the caller instead of dereferencing NULL.
    return NULL;
  }

  gk->size = size;
  // The only accepted word is the empty one.
  gk->epsilon = true;
  gk->first = create_barray(size);
  gk->last = create_barray(size);
  gk->follow = create_barray(size * size);

  return gk;
}
|
||
|
|
|
||
|
|
/*
 * Builds the Glushkov information of the single-letter expression whose
 * letter has index `num`, inside an expression of `size` letters in total.
 *
 * The empty word is not accepted (epsilon is false). Cell `num` is set in
 * both the first and the last sets, and the follow set (size * size cells)
 * is left as create_barray() returns it.
 *
 * NOTE(review): `num` is assumed to be strictly smaller than `size`; nothing
 * here checks it before calling settrue_barray().
 *
 * Returns a newly allocated object (to be released with reg_gk_delete()), or
 * NULL if the structure cannot be allocated.
 */
glushkov_info *reg_create_gk_let(uint num, uint size) {
  glushkov_info *gk = malloc(sizeof *gk);
  if (gk == NULL) {
    // Out of memory: report it to the caller instead of dereferencing NULL.
    return NULL;
  }

  gk->size = size;
  // A single letter never produces the empty word.
  gk->epsilon = false;

  // The letter is both the only possible first and the only possible last.
  gk->first = create_barray(size);
  settrue_barray(gk->first, num);
  gk->last = create_barray(size);
  settrue_barray(gk->last, num);

  // One letter alone has no successor.
  gk->follow = create_barray(size * size);

  return gk;
}
|
||
|
|
|
||
|
|
/*
 * Computes the Glushkov information of the Kleene star of the expression
 * described by `info`. `info` itself is not modified and stays owned by the
 * caller.
 *
 * In the result:
 *   - epsilon is always true;
 *   - first and last are copies of those of `info`;
 *   - follow is a copy of the follow set of `info` in which, additionally,
 *     cell (i * size + j) is set for every i in last and every j in first
 *     (a last letter may be followed by a first letter when looping).
 *
 * Returns a newly allocated object (to be released with reg_gk_delete()), or
 * NULL if `info` is NULL or the structure cannot be allocated.
 */
glushkov_info *reg_gk_star(glushkov_info *info) {
  if (info == NULL) {
    return NULL;
  }

  glushkov_info *gk = malloc(sizeof *gk);
  if (gk == NULL) {
    // Out of memory: report it to the caller instead of dereferencing NULL.
    return NULL;
  }

  const uint size = info->size;
  gk->size = size;

  // The star of any expression accepts the empty word.
  gk->epsilon = true;

  // First and last letters are those of the starred expression.
  gk->first = copy_barray(info->first);
  gk->last = copy_barray(info->last);

  // Start from the existing successors...
  gk->follow = copy_barray(info->follow);

  // ...and add every (last letter, first letter) pair created by the loop.
  for (uint i = 0; i < size; i++) {
    if (!getval_barray(info->last, i)) {
      continue;
    }
    for (uint j = 0; j < size; j++) {
      if (getval_barray(info->first, j)) {
        settrue_barray(gk->follow, i * size + j);
      }
    }
  }

  return gk;
}
|
||
|
|
|
||
|
|
/*
 * Computes the Glushkov information of the union of two expressions.
 * Neither operand is modified; both stay owned by the caller.
 *
 * In the result:
 *   - size is taken from the left operand (both operands are expected to
 *     have been built for the same total number of letters);
 *   - epsilon is true when at least one operand accepts the empty word;
 *   - first, last and follow are obtained by combining the corresponding
 *     sets of both operands with or_barray().
 *
 * Returns a newly allocated object (to be released with reg_gk_delete()), or
 * NULL if an operand is NULL or the structure cannot be allocated.
 */
glushkov_info *reg_gk_union(glushkov_info *info_left,
                            glushkov_info *info_right) {
  if (info_left == NULL || info_right == NULL) {
    return NULL;
  }

  glushkov_info *gk = malloc(sizeof *gk);
  if (gk == NULL) {
    // Out of memory: report it to the caller instead of dereferencing NULL.
    return NULL;
  }

  gk->size = info_left->size;
  gk->epsilon = info_left->epsilon || info_right->epsilon;

  gk->first = or_barray(info_left->first, info_right->first);
  gk->last = or_barray(info_left->last, info_right->last);
  gk->follow = or_barray(info_left->follow, info_right->follow);

  return gk;
}
|
||
|
|
|
||
|
|
/*
 * Computes the Glushkov information of the concatenation (left then right)
 * of two expressions. Neither operand is modified; both stay owned by the
 * caller.
 *
 * In the result:
 *   - size is taken from the left operand;
 *   - epsilon is true only when both operands accept the empty word;
 *   - first is first(left), combined with first(right) when left accepts
 *     the empty word;
 *   - last is last(right), combined with last(left) when right accepts the
 *     empty word;
 *   - follow combines both follow sets and additionally sets the cell
 *     (l * size + f) for every l in last(left) and every f in first(right).
 *
 * Returns a newly allocated object (to be released with reg_gk_delete()), or
 * NULL if an operand is NULL or the structure cannot be allocated (the
 * allocation failure is also reported through ERROR).
 */
glushkov_info *reg_gk_concat(glushkov_info *ptr_info_left,
                             glushkov_info *ptr_info_right) {
  if (ptr_info_left == NULL || ptr_info_right == NULL) {
    return NULL;
  }

  glushkov_info *new_ptr = malloc(sizeof *new_ptr);
  if (new_ptr == NULL) {
    ERROR("reg_gk_concat : malloc failed");
    // Never fall through to the dereferences below with a NULL pointer.
    return NULL;
  }

  const uint size = ptr_info_left->size;
  new_ptr->size = size;
  new_ptr->epsilon = (ptr_info_left->epsilon && ptr_info_right->epsilon);

  // A word may start in the right operand only if the left one can be empty.
  if (ptr_info_left->epsilon) {
    new_ptr->first = or_barray(ptr_info_left->first, ptr_info_right->first);
  } else {
    new_ptr->first = copy_barray(ptr_info_left->first);
  }

  // A word may end in the left operand only if the right one can be empty.
  if (ptr_info_right->epsilon) {
    new_ptr->last = or_barray(ptr_info_left->last, ptr_info_right->last);
  } else {
    new_ptr->last = copy_barray(ptr_info_right->last);
  }

  new_ptr->follow = or_barray(ptr_info_left->follow, ptr_info_right->follow);

  // Junction: each last letter of left may be followed by each first letter
  // of right.
  for (uint l = 0; l < size; l++) {
    if (!getval_barray(ptr_info_left->last, l)) {
      continue;
    }
    for (uint f = 0; f < size; f++) {
      if (getval_barray(ptr_info_right->first, f)) {
        settrue_barray(new_ptr->follow, (l * size) + f);
      }
    }
  }

  return new_ptr;
}
|
||
|
|
|
||
|
|
/*
 * Releases a Glushkov information object: its first, last and follow arrays
 * are handed to delete_barray(), then the structure itself is freed.
 * Calling it with NULL does nothing.
 */
void reg_gk_delete(glushkov_info *info) {
  if (info != NULL) {
    // Release the three owned arrays before the structure that holds them.
    delete_barray(info->first);
    delete_barray(info->last);
    delete_barray(info->follow);
    free(info);
  }
}
|
||
|
|
|
||
|
|
/*
 * Recursively computes the Glushkov information of the expression rooted at
 * ptr_regexp while numbering its letter leaves.
 *
 * Parameters:
 *   ptr_regexp  expression tree; each CHAR node is MODIFIED: its letter is
 *               replaced by the index assigned to it.
 *   nbr_letter  total number of letters of the whole expression (size used
 *               for every glushkov_info built here).
 *   ptr_index   in/out counter holding the next free index; incremented
 *               once per CHAR node encountered.
 *   ptr_map     output array in which ptr_map[index] receives the original
 *               letter of the CHAR node that was given that index.
 *
 * Handles EMPTY, EPSILON, CHAR, UNION, CONCAT and STAR. Any other operator,
 * a NULL node, or a failure while processing a sub-expression yields NULL.
 *
 * Intermediate results of sub-expressions are released here once they have
 * been combined (this assumes reg_gk_union / reg_gk_concat / reg_gk_star
 * return objects that do not share arrays with their operands, as their use
 * of copy_barray / or_barray suggests).
 *
 * Returns a newly allocated object to be released with reg_gk_delete().
 */
glushkov_info *gk_indexleaves(regexp *ptr_regexp, uint nbr_letter,
                              uint *ptr_index, char *ptr_map) {
  if (ptr_regexp == NULL) {
    return NULL;
  }

  switch (ptr_regexp->op) {
  case EMPTY:
    return reg_create_gk_emp(nbr_letter);

  case EPSILON:
    return reg_create_gk_eps(nbr_letter);

  case CHAR: {
    const uint position = *ptr_index;
    // Remember which letter this position stands for...
    ptr_map[position] = ptr_regexp->letter;
    // ...then rename the letter by its position number.
    ptr_regexp->letter = position;
    *ptr_index = position + 1;
    return reg_create_gk_let(position, nbr_letter);
  }

  case UNION:
  case CONCAT: {
    // The left subtree must be indexed before the right one so positions
    // are numbered left to right; nesting both calls as function arguments
    // would leave their evaluation order unspecified.
    glushkov_info *left =
        gk_indexleaves(ptr_regexp->left, nbr_letter, ptr_index, ptr_map);
    glushkov_info *right =
        gk_indexleaves(ptr_regexp->right, nbr_letter, ptr_index, ptr_map);

    glushkov_info *result = NULL;
    if (left != NULL && right != NULL) {
      result = (ptr_regexp->op == UNION) ? reg_gk_union(left, right)
                                         : reg_gk_concat(left, right);
    }

    // The operands are no longer needed once combined (or on failure).
    reg_gk_delete(left);
    reg_gk_delete(right);
    return result;
  }

  case STAR: {
    glushkov_info *inner =
        gk_indexleaves(ptr_regexp->left, nbr_letter, ptr_index, ptr_map);
    if (inner == NULL) {
      return NULL;
    }
    glushkov_info *result = reg_gk_star(inner);
    reg_gk_delete(inner);
    return result;
  }

  default:
    return NULL;
  }
}
|
||
|
|
|
||
|
|
/*
 * Builds an NFA from a regular expression with Glushkov's algorithm.
 *
 * The expression must satisfy reg_issimple(); otherwise NULL is returned.
 * The letters of ptr_regexp are renumbered in place by gk_indexleaves().
 *
 * Layout of the automaton, for n letters in the expression:
 *   - n + 1 states: state i (0 <= i < n) stands for letter number i, and
 *     state n is the single initial state;
 *   - the graph is created with n labels, label i being letter number i;
 *   - for every pair (a, b) set in the follow table, b is inserted in
 *     edges[a][b];
 *   - for every letter i set in first, i is inserted in edges[n][i];
 *   - final states are the letters set in last, plus state n when the
 *     expression accepts the empty word;
 *   - alpha_names[i] is the original character of letter number i;
 *   - state_names is NULL.
 *
 * Returns the new NFA, or NULL if the expression is not simple, if the
 * Glushkov information cannot be computed, or if an allocation fails.
 */
nfa *reg_glushkov(regexp *ptr_regexp) {
  // Only simple regular expressions are supported.
  if (!reg_issimple(ptr_regexp)) {
    return NULL;
  }

  // Number of letters of the expression and next index to hand out.
  uint nb_letters = reg_countletters(ptr_regexp);
  uint index = 0;

  // Index -> original letter table, zero-filled. At least one byte is
  // requested so that a NULL result always means an allocation failure.
  char *map = calloc(nb_letters > 0 ? nb_letters : 1, sizeof *map);
  if (map == NULL) {
    ERROR("reg_glushkov : malloc failed");
    return NULL;
  }

  // Compute first / last / follow while renumbering the letters.
  glushkov_info *ptr_glu_info =
      gk_indexleaves(ptr_regexp, nb_letters, &index, map);
  if (ptr_glu_info == NULL) {
    // Must be tested before any use of ptr_glu_info below.
    free(map);
    return NULL;
  }

  nfa *ptr_nfa = malloc(sizeof *ptr_nfa);
  if (ptr_nfa == NULL) {
    reg_gk_delete(ptr_glu_info);
    free(map);
    ERROR("reg_glushkov : malloc failed");
    return NULL;
  }

  // One state per letter plus the extra initial state (the last one).
  const uint size = ptr_glu_info->size;
  const uint nb_sommet = size + 1;
  const uint init_state = nb_sommet - 1;
  ptr_nfa->trans = create_lgraph_noedges(nb_sommet, size);

  // Every pair (a, b) of the follow table gives an edge a -> b labelled by
  // letter b. The table is scanned in increasing flat-index order.
  for (uint sommet_a = 0; sommet_a < size; sommet_a++) {
    for (uint sommet_b = 0; sommet_b < size; sommet_b++) {
      if (getval_barray(ptr_glu_info->follow, sommet_a * size + sommet_b)) {
        insert_dequeue(ptr_nfa->trans->edges[sommet_a][sommet_b], sommet_b);
      }
    }
  }

  // The initial state reaches every first letter, reading that letter.
  for (uint ind_init = 0; ind_init < size; ind_init++) {
    if (getval_barray(ptr_glu_info->first, ind_init)) {
      insert_dequeue(ptr_nfa->trans->edges[init_state][ind_init], ind_init);
    }
  }

  // Single initial state.
  ptr_nfa->initials = create_dequeue();
  insert_dequeue(ptr_nfa->initials, init_state);

  // Final states: every last letter...
  ptr_nfa->finals = create_dequeue();
  for (uint ind_last = 0; ind_last < size; ind_last++) {
    if (getval_barray(ptr_glu_info->last, ind_last)) {
      insert_dequeue(ptr_nfa->finals, ind_last);
    }
  }
  // ...and the initial state when the empty word is accepted.
  if (ptr_glu_info->epsilon) {
    insert_dequeue(ptr_nfa->finals, init_state);
  }

  // The Glushkov tables are no longer needed.
  reg_gk_delete(ptr_glu_info);

  ptr_nfa->state_names = NULL;

  // Names of the letters: copy the index -> letter table.
  ptr_nfa->alpha_names = malloc(ptr_nfa->trans->size_alpha * sizeof(char));
  if (ptr_nfa->alpha_names == NULL && ptr_nfa->trans->size_alpha > 0) {
    free(map);
    // NOTE(review): assumes nfa_delete() accepts an automaton whose
    // alpha_names and state_names are NULL - confirm in nfa.h.
    nfa_delete(ptr_nfa);
    ERROR("reg_glushkov : malloc failed");
    return NULL;
  }
  for (uint ind_map = 0; ind_map < nb_letters; ind_map++) {
    ptr_nfa->alpha_names[ind_map] = map[ind_map];
  }

  // The temporary table has been copied; release it.
  free(map);

  return ptr_nfa;
}
|
||
|
|
|
||
|
|
/****************************/
|
||
|
|
/*+ Algorithme de Thompson +*/
|
||
|
|
/****************************/
|
||
|
|
|
||
|
|
/*
 * Builds an NFA from a regular expression by structural recursion
 * (Thompson-style construction).
 *
 * Each operator is mapped to the matching automaton operation:
 *   EMPTY      -> create_emptylang()
 *   EPSILON    -> create_sing_epsilon()
 *   CHAR       -> create_sing_letter(letter)
 *   UNION      -> nfa_union(left, right)
 *   CONCAT     -> nfa_concat(left, right)
 *   INTER      -> nfa_intersect(left, right, false)
 *   STAR       -> nfa_star(sub)
 *   COMPLEMENT -> nfa_mirror(nfa_determinize(sub, false))
 *
 * The automata built for sub-expressions are deleted once they have been
 * combined. Returns the new NFA, or NULL if expr is NULL, if its operator is
 * not handled, or if a sub-expression could not be converted.
 */
nfa *reg_thompson(regexp *expr) {
  if (expr == NULL) {
    return NULL;
  }

  switch (expr->op) {
  case EMPTY:
    return create_emptylang();

  case EPSILON:
    return create_sing_epsilon();

  case CHAR:
    return create_sing_letter(expr->letter);

  case UNION:
  case CONCAT:
  case INTER: {
    // The three binary operators share the same build / combine / release
    // sequence; only the combining function differs.
    nfa *left_nfa = reg_thompson(expr->left);
    nfa *right_nfa = reg_thompson(expr->right);
    nfa *result_nfa = NULL;

    // Combine only when both operands were built successfully.
    if (left_nfa != NULL && right_nfa != NULL) {
      if (expr->op == UNION) {
        result_nfa = nfa_union(left_nfa, right_nfa);
      } else if (expr->op == CONCAT) {
        result_nfa = nfa_concat(left_nfa, right_nfa);
      } else {
        result_nfa = nfa_intersect(left_nfa, right_nfa, false);
      }
    }

    if (left_nfa != NULL) {
      nfa_delete(left_nfa);
    }
    if (right_nfa != NULL) {
      nfa_delete(right_nfa);
    }
    return result_nfa;
  }

  case STAR: {
    nfa *sub_nfa = reg_thompson(expr->left);
    if (sub_nfa == NULL) {
      return NULL;
    }
    nfa *result_nfa = nfa_star(sub_nfa);
    nfa_delete(sub_nfa);
    return result_nfa;
  }

  case COMPLEMENT: {
    nfa *sub_nfa = reg_thompson(expr->left);
    if (sub_nfa == NULL) {
      return NULL;
    }
    nfa *det_nfa = nfa_determinize(sub_nfa, false);
    nfa_delete(sub_nfa);
    if (det_nfa == NULL) {
      return NULL;
    }
    // NOTE(review): nfa_mirror() is named like a reversal of the automaton,
    // not a complement; confirm that this really yields the complement of
    // the determinized automaton.
    nfa *comp_nfa = nfa_mirror(det_nfa);
    nfa_delete(det_nfa);
    return comp_nfa;
  }

  default:
    return NULL;
  }
}
|