regex_internal.c 95 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632
  1. /*
  2. This file is part of GNUnet
  3. (C) 2012 Christian Grothoff (and other contributing authors)
  4. GNUnet is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published
  6. by the Free Software Foundation; either version 3, or (at your
  7. option) any later version.
  8. GNUnet is distributed in the hope that it will be useful, but
  9. WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with GNUnet; see the file COPYING. If not, write to the
  14. Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  15. Boston, MA 02111-1307, USA.
  16. */
  17. /**
  18. * @file src/regex/regex_internal.c
  19. * @brief library to create Deterministic Finite Automatons (DFAs) from regular
  20. * expressions (regexes).
  21. * @author Maximilian Szengel
  22. */
  23. #include "platform.h"
  24. #include "gnunet_util_lib.h"
  25. #include "gnunet_regex_service.h"
  26. #include "regex_internal_lib.h"
  27. #include "regex_internal.h"
  28. /**
  29. * Set this to GNUNET_YES to enable state naming. Used to debug NFA->DFA
  30. * creation. Disabled by default for better performance.
  31. */
  32. #define REGEX_DEBUG_DFA GNUNET_NO
  33. /**
  34. * Set of states using MDLL API.
  35. */
  36. struct REGEX_INTERNAL_StateSet_MDLL
  37. {
  38. /**
  39. * MDLL of states.
  40. */
  41. struct REGEX_INTERNAL_State *head;
  42. /**
  43. * MDLL of states.
  44. */
  45. struct REGEX_INTERNAL_State *tail;
  46. /**
  47. * Length of the MDLL.
  48. */
  49. unsigned int len;
  50. };
  51. /**
  52. * Append state to the given StateSet '
  53. *
  54. * @param set set to be modified
  55. * @param state state to be appended
  56. */
  57. static void
  58. state_set_append (struct REGEX_INTERNAL_StateSet *set,
  59. struct REGEX_INTERNAL_State *state)
  60. {
  61. if (set->off == set->size)
  62. GNUNET_array_grow (set->states, set->size, set->size * 2 + 4);
  63. set->states[set->off++] = state;
  64. }
  65. /**
  66. * Compare two strings for equality. If either is NULL they are not equal.
  67. *
  68. * @param str1 first string for comparison.
  69. * @param str2 second string for comparison.
  70. *
  71. * @return 0 if the strings are the same or both NULL, 1 or -1 if not.
  72. */
  73. static int
  74. nullstrcmp (const char *str1, const char *str2)
  75. {
  76. if ((NULL == str1) != (NULL == str2))
  77. return -1;
  78. if ((NULL == str1) && (NULL == str2))
  79. return 0;
  80. return strcmp (str1, str2);
  81. }
  82. /**
  83. * Adds a transition from one state to another on 'label'. Does not add
  84. * duplicate states.
  85. *
  86. * @param ctx context
  87. * @param from_state starting state for the transition
  88. * @param label transition label
  89. * @param to_state state to where the transition should point to
  90. */
  91. static void
  92. state_add_transition (struct REGEX_INTERNAL_Context *ctx,
  93. struct REGEX_INTERNAL_State *from_state, const char *label,
  94. struct REGEX_INTERNAL_State *to_state)
  95. {
  96. struct REGEX_INTERNAL_Transition *t;
  97. struct REGEX_INTERNAL_Transition *oth;
  98. if (NULL == from_state)
  99. {
  100. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not create Transition.\n");
  101. return;
  102. }
  103. /* Do not add duplicate state transitions */
  104. for (t = from_state->transitions_head; NULL != t; t = t->next)
  105. {
  106. if (t->to_state == to_state && 0 == nullstrcmp (t->label, label) &&
  107. t->from_state == from_state)
  108. return;
  109. }
  110. /* sort transitions by label */
  111. for (oth = from_state->transitions_head; NULL != oth; oth = oth->next)
  112. {
  113. if (0 < nullstrcmp (oth->label, label))
  114. break;
  115. }
  116. t = GNUNET_new (struct REGEX_INTERNAL_Transition);
  117. if (NULL != ctx)
  118. t->id = ctx->transition_id++;
  119. if (NULL != label)
  120. t->label = GNUNET_strdup (label);
  121. else
  122. t->label = NULL;
  123. t->to_state = to_state;
  124. t->from_state = from_state;
  125. /* Add outgoing transition to 'from_state' */
  126. from_state->transition_count++;
  127. GNUNET_CONTAINER_DLL_insert_before (from_state->transitions_head,
  128. from_state->transitions_tail, oth, t);
  129. }
  130. /**
  131. * Remove a 'transition' from 'state'.
  132. *
  133. * @param state state from which the to-be-removed transition originates.
  134. * @param transition transition that should be removed from state 'state'.
  135. */
  136. static void
  137. state_remove_transition (struct REGEX_INTERNAL_State *state,
  138. struct REGEX_INTERNAL_Transition *transition)
  139. {
  140. if (NULL == state || NULL == transition)
  141. return;
  142. if (transition->from_state != state)
  143. return;
  144. GNUNET_free_non_null (transition->label);
  145. state->transition_count--;
  146. GNUNET_CONTAINER_DLL_remove (state->transitions_head, state->transitions_tail,
  147. transition);
  148. GNUNET_free (transition);
  149. }
  150. /**
  151. * Compare two states. Used for sorting.
  152. *
  153. * @param a first state
  154. * @param b second state
  155. *
  156. * @return an integer less than, equal to, or greater than zero
  157. * if the first argument is considered to be respectively
  158. * less than, equal to, or greater than the second.
  159. */
  160. static int
  161. state_compare (const void *a, const void *b)
  162. {
  163. struct REGEX_INTERNAL_State **s1 = (struct REGEX_INTERNAL_State **) a;
  164. struct REGEX_INTERNAL_State **s2 = (struct REGEX_INTERNAL_State **) b;
  165. return (*s1)->id - (*s2)->id;
  166. }
  167. /**
  168. * Get all edges leaving state 's'.
  169. *
  170. * @param s state.
  171. * @param edges all edges leaving 's', expected to be allocated and have enough
  172. * space for s->transitions_count elements.
  173. *
  174. * @return number of edges.
  175. */
  176. static unsigned int
  177. state_get_edges (struct REGEX_INTERNAL_State *s, struct REGEX_BLOCK_Edge *edges)
  178. {
  179. struct REGEX_INTERNAL_Transition *t;
  180. unsigned int count;
  181. if (NULL == s)
  182. return 0;
  183. count = 0;
  184. for (t = s->transitions_head; NULL != t; t = t->next)
  185. {
  186. if (NULL != t->to_state)
  187. {
  188. edges[count].label = t->label;
  189. edges[count].destination = t->to_state->hash;
  190. count++;
  191. }
  192. }
  193. return count;
  194. }
  195. /**
  196. * Compare to state sets by comparing the id's of the states that are contained
  197. * in each set. Both sets are expected to be sorted by id!
  198. *
  199. * @param sset1 first state set
  200. * @param sset2 second state set
  201. * @return 0 if the sets are equal, otherwise non-zero
  202. */
  203. static int
  204. state_set_compare (struct REGEX_INTERNAL_StateSet *sset1,
  205. struct REGEX_INTERNAL_StateSet *sset2)
  206. {
  207. int result;
  208. unsigned int i;
  209. if (NULL == sset1 || NULL == sset2)
  210. return 1;
  211. result = sset1->off - sset2->off;
  212. if (result < 0)
  213. return -1;
  214. if (result > 0)
  215. return 1;
  216. for (i = 0; i < sset1->off; i++)
  217. if (0 != (result = state_compare (&sset1->states[i], &sset2->states[i])))
  218. break;
  219. return result;
  220. }
  221. /**
  222. * Clears the given StateSet 'set'
  223. *
  224. * @param set set to be cleared
  225. */
  226. static void
  227. state_set_clear (struct REGEX_INTERNAL_StateSet *set)
  228. {
  229. GNUNET_array_grow (set->states, set->size, 0);
  230. set->off = 0;
  231. }
  232. /**
  233. * Clears an automaton fragment. Does not destroy the states inside the
  234. * automaton.
  235. *
  236. * @param a automaton to be cleared
  237. */
  238. static void
  239. automaton_fragment_clear (struct REGEX_INTERNAL_Automaton *a)
  240. {
  241. if (NULL == a)
  242. return;
  243. a->start = NULL;
  244. a->end = NULL;
  245. a->states_head = NULL;
  246. a->states_tail = NULL;
  247. a->state_count = 0;
  248. GNUNET_free (a);
  249. }
  250. /**
  251. * Frees the memory used by State 's'
  252. *
  253. * @param s state that should be destroyed
  254. */
  255. static void
  256. automaton_destroy_state (struct REGEX_INTERNAL_State *s)
  257. {
  258. struct REGEX_INTERNAL_Transition *t;
  259. struct REGEX_INTERNAL_Transition *next_t;
  260. if (NULL == s)
  261. return;
  262. GNUNET_free_non_null (s->name);
  263. GNUNET_free_non_null (s->proof);
  264. state_set_clear (&s->nfa_set);
  265. for (t = s->transitions_head; NULL != t; t = next_t)
  266. {
  267. next_t = t->next;
  268. state_remove_transition (s, t);
  269. }
  270. GNUNET_free (s);
  271. }
  272. /**
  273. * Remove a state from the given automaton 'a'. Always use this function when
  274. * altering the states of an automaton. Will also remove all transitions leading
  275. * to this state, before destroying it.
  276. *
  277. * @param a automaton
  278. * @param s state to remove
  279. */
  280. static void
  281. automaton_remove_state (struct REGEX_INTERNAL_Automaton *a,
  282. struct REGEX_INTERNAL_State *s)
  283. {
  284. struct REGEX_INTERNAL_State *s_check;
  285. struct REGEX_INTERNAL_Transition *t_check;
  286. struct REGEX_INTERNAL_Transition *t_check_next;
  287. if (NULL == a || NULL == s)
  288. return;
  289. /* remove all transitions leading to this state */
  290. for (s_check = a->states_head; NULL != s_check; s_check = s_check->next)
  291. {
  292. for (t_check = s_check->transitions_head; NULL != t_check;
  293. t_check = t_check_next)
  294. {
  295. t_check_next = t_check->next;
  296. if (t_check->to_state == s)
  297. state_remove_transition (s_check, t_check);
  298. }
  299. }
  300. /* remove state */
  301. GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s);
  302. a->state_count--;
  303. automaton_destroy_state (s);
  304. }
  305. /**
  306. * Merge two states into one. Will merge 's1' and 's2' into 's1' and destroy
  307. * 's2'. 's1' will contain all (non-duplicate) outgoing transitions of 's2'.
  308. *
  309. * @param ctx context
  310. * @param a automaton
  311. * @param s1 first state
  312. * @param s2 second state, will be destroyed
  313. */
  314. static void
  315. automaton_merge_states (struct REGEX_INTERNAL_Context *ctx,
  316. struct REGEX_INTERNAL_Automaton *a,
  317. struct REGEX_INTERNAL_State *s1,
  318. struct REGEX_INTERNAL_State *s2)
  319. {
  320. struct REGEX_INTERNAL_State *s_check;
  321. struct REGEX_INTERNAL_Transition *t_check;
  322. struct REGEX_INTERNAL_Transition *t;
  323. struct REGEX_INTERNAL_Transition *t_next;
  324. int is_dup;
  325. if (s1 == s2)
  326. return;
  327. /* 1. Make all transitions pointing to s2 point to s1, unless this transition
  328. * does not already exists, if it already exists remove transition. */
  329. for (s_check = a->states_head; NULL != s_check; s_check = s_check->next)
  330. {
  331. for (t_check = s_check->transitions_head; NULL != t_check; t_check = t_next)
  332. {
  333. t_next = t_check->next;
  334. if (s2 == t_check->to_state)
  335. {
  336. is_dup = GNUNET_NO;
  337. for (t = t_check->from_state->transitions_head; NULL != t; t = t->next)
  338. {
  339. if (t->to_state == s1 && 0 == strcmp (t_check->label, t->label))
  340. is_dup = GNUNET_YES;
  341. }
  342. if (GNUNET_NO == is_dup)
  343. t_check->to_state = s1;
  344. else
  345. state_remove_transition (t_check->from_state, t_check);
  346. }
  347. }
  348. }
  349. /* 2. Add all transitions from s2 to sX to s1 */
  350. for (t_check = s2->transitions_head; NULL != t_check; t_check = t_check->next)
  351. {
  352. if (t_check->to_state != s1)
  353. state_add_transition (ctx, s1, t_check->label, t_check->to_state);
  354. }
  355. /* 3. Rename s1 to {s1,s2} */
  356. #if REGEX_DEBUG_DFA
  357. char *new_name;
  358. new_name = s1->name;
  359. GNUNET_asprintf (&s1->name, "{%s,%s}", new_name, s2->name);
  360. GNUNET_free (new_name);
  361. #endif
  362. /* remove state */
  363. GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s2);
  364. a->state_count--;
  365. automaton_destroy_state (s2);
  366. }
  367. /**
  368. * Add a state to the automaton 'a', always use this function to alter the
  369. * states DLL of the automaton.
  370. *
  371. * @param a automaton to add the state to
  372. * @param s state that should be added
  373. */
  374. static void
  375. automaton_add_state (struct REGEX_INTERNAL_Automaton *a,
  376. struct REGEX_INTERNAL_State *s)
  377. {
  378. GNUNET_CONTAINER_DLL_insert (a->states_head, a->states_tail, s);
  379. a->state_count++;
  380. }
  381. /**
  382. * Depth-first traversal (DFS) of all states that are reachable from state
  383. * 's'. Performs 'action' on each visited state.
  384. *
  385. * @param s start state.
  386. * @param marks an array of size a->state_count to remember which state was
  387. * already visited.
  388. * @param count current count of the state.
  389. * @param check function that is checked before advancing on each transition
  390. * in the DFS.
  391. * @param check_cls closure for check.
  392. * @param action action to be performed on each state.
  393. * @param action_cls closure for action.
  394. */
  395. static void
  396. automaton_state_traverse (struct REGEX_INTERNAL_State *s, int *marks,
  397. unsigned int *count,
  398. REGEX_INTERNAL_traverse_check check, void *check_cls,
  399. REGEX_INTERNAL_traverse_action action, void *action_cls)
  400. {
  401. struct REGEX_INTERNAL_Transition *t;
  402. if (GNUNET_YES == marks[s->traversal_id])
  403. return;
  404. marks[s->traversal_id] = GNUNET_YES;
  405. if (NULL != action)
  406. action (action_cls, *count, s);
  407. (*count)++;
  408. for (t = s->transitions_head; NULL != t; t = t->next)
  409. {
  410. if (NULL == check ||
  411. (NULL != check && GNUNET_YES == check (check_cls, s, t)))
  412. {
  413. automaton_state_traverse (t->to_state, marks, count, check, check_cls,
  414. action, action_cls);
  415. }
  416. }
  417. }
  418. /**
  419. * Traverses the given automaton using depth-first-search (DFS) from it's start
  420. * state, visiting all reachable states and calling 'action' on each one of
  421. * them.
  422. *
  423. * @param a automaton to be traversed.
  424. * @param start start state, pass a->start or NULL to traverse the whole automaton.
  425. * @param check function that is checked before advancing on each transition
  426. * in the DFS.
  427. * @param check_cls closure for check.
  428. * @param action action to be performed on each state.
  429. * @param action_cls closure for action
  430. */
  431. void
  432. REGEX_INTERNAL_automaton_traverse (const struct REGEX_INTERNAL_Automaton *a,
  433. struct REGEX_INTERNAL_State *start,
  434. REGEX_INTERNAL_traverse_check check,
  435. void *check_cls,
  436. REGEX_INTERNAL_traverse_action action,
  437. void *action_cls)
  438. {
  439. unsigned int count;
  440. struct REGEX_INTERNAL_State *s;
  441. if (NULL == a || 0 == a->state_count)
  442. return;
  443. int marks[a->state_count];
  444. for (count = 0, s = a->states_head; NULL != s && count < a->state_count;
  445. s = s->next, count++)
  446. {
  447. s->traversal_id = count;
  448. marks[s->traversal_id] = GNUNET_NO;
  449. }
  450. count = 0;
  451. if (NULL == start)
  452. s = a->start;
  453. else
  454. s = start;
  455. automaton_state_traverse (s, marks, &count, check, check_cls, action,
  456. action_cls);
  457. }
  458. /**
  459. * String container for faster string operations.
  460. */
  461. struct StringBuffer
  462. {
  463. /**
  464. * Buffer holding the string (may start in the middle!);
  465. * NOT 0-terminated!
  466. */
  467. char *sbuf;
  468. /**
  469. * Allocated buffer.
  470. */
  471. char *abuf;
  472. /**
  473. * Length of the string in the buffer.
  474. */
  475. size_t slen;
  476. /**
  477. * Number of bytes allocated for 'sbuf'
  478. */
  479. unsigned int blen;
  480. /**
  481. * Buffer currently represents "NULL" (not the empty string!)
  482. */
  483. int16_t null_flag;
  484. /**
  485. * If this entry is part of the last/current generation array,
  486. * this flag is GNUNET_YES if the last and current generation are
  487. * identical (and thus copying is unnecessary if the value didn't
  488. * change). This is used in an optimization that improves
  489. * performance by about 1% --- if we use int16_t here. With just
  490. * "int" for both flags, performance drops (on my system) significantly,
  491. * most likely due to increased cache misses.
  492. */
  493. int16_t synced;
  494. };
  495. /**
  496. * Compare two strings for equality. If either is NULL they are not equal.
  497. *
  498. * @param s1 first string for comparison.
  499. * @param s2 second string for comparison.
  500. *
  501. * @return 0 if the strings are the same or both NULL, 1 or -1 if not.
  502. */
  503. static int
  504. sb_nullstrcmp (const struct StringBuffer *s1,
  505. const struct StringBuffer *s2)
  506. {
  507. if ( (GNUNET_YES == s1->null_flag) &&
  508. (GNUNET_YES == s2->null_flag) )
  509. return 0;
  510. if ( (GNUNET_YES == s1->null_flag) ||
  511. (GNUNET_YES == s2->null_flag) )
  512. return -1;
  513. if (s1->slen != s2->slen)
  514. return -1;
  515. return memcmp (s1->sbuf, s2->sbuf, s1->slen);
  516. }
  517. /**
  518. * Compare two strings for equality.
  519. *
  520. * @param s1 first string for comparison.
  521. * @param s2 second string for comparison.
  522. *
  523. * @return 0 if the strings are the same, 1 or -1 if not.
  524. */
  525. static int
  526. sb_strcmp (const struct StringBuffer *s1,
  527. const struct StringBuffer *s2)
  528. {
  529. if (s1->slen != s2->slen)
  530. return -1;
  531. return memcmp (s1->sbuf, s2->sbuf, s1->slen);
  532. }
  533. /**
  534. * Reallocate the buffer of 'ret' to fit 'nlen' characters;
  535. * move the existing string to the beginning of the new buffer.
  536. *
  537. * @param ret current buffer, to be updated
  538. * @param nlen target length for the buffer, must be at least ret->slen
  539. */
  540. static void
  541. sb_realloc (struct StringBuffer *ret,
  542. size_t nlen)
  543. {
  544. char *old;
  545. GNUNET_assert (nlen >= ret->slen);
  546. old = ret->abuf;
  547. ret->abuf = GNUNET_malloc (nlen);
  548. ret->blen = nlen;
  549. memcpy (ret->abuf,
  550. ret->sbuf,
  551. ret->slen);
  552. ret->sbuf = ret->abuf;
  553. GNUNET_free_non_null (old);
  554. }
  555. /**
  556. * Append a string.
  557. *
  558. * @param ret where to write the result
  559. * @param sarg string to append
  560. */
  561. static void
  562. sb_append (struct StringBuffer *ret,
  563. const struct StringBuffer *sarg)
  564. {
  565. if (GNUNET_YES == ret->null_flag)
  566. ret->slen = 0;
  567. ret->null_flag = GNUNET_NO;
  568. if (ret->blen < sarg->slen + ret->slen)
  569. sb_realloc (ret, ret->blen + sarg->slen + 128);
  570. memcpy (&ret->sbuf[ret->slen],
  571. sarg->sbuf,
  572. sarg->slen);
  573. ret->slen += sarg->slen;
  574. }
  575. /**
  576. * Append a C string.
  577. *
  578. * @param ret where to write the result
  579. * @param cstr string to append
  580. */
  581. static void
  582. sb_append_cstr (struct StringBuffer *ret,
  583. const char *cstr)
  584. {
  585. size_t cstr_len = strlen (cstr);
  586. if (GNUNET_YES == ret->null_flag)
  587. ret->slen = 0;
  588. ret->null_flag = GNUNET_NO;
  589. if (ret->blen < cstr_len + ret->slen)
  590. sb_realloc (ret, ret->blen + cstr_len + 128);
  591. memcpy (&ret->sbuf[ret->slen],
  592. cstr,
  593. cstr_len);
  594. ret->slen += cstr_len;
  595. }
  596. /**
  597. * Wrap a string buffer, that is, set ret to the format string
  598. * which contains an "%s" which is to be replaced with the original
  599. * content of 'ret'. Note that optimizing this function is not
  600. * really worth it, it is rarely called.
  601. *
  602. * @param ret where to write the result and take the input for %.*s from
  603. * @param format format string, fprintf-style, with exactly one "%.*s"
  604. * @param extra_chars how long will the result be, in addition to 'sarg' length
  605. */
  606. static void
  607. sb_wrap (struct StringBuffer *ret,
  608. const char *format,
  609. size_t extra_chars)
  610. {
  611. char *temp;
  612. if (GNUNET_YES == ret->null_flag)
  613. ret->slen = 0;
  614. ret->null_flag = GNUNET_NO;
  615. temp = GNUNET_malloc (ret->slen + extra_chars + 1);
  616. GNUNET_snprintf (temp,
  617. ret->slen + extra_chars + 1,
  618. format,
  619. (int) ret->slen,
  620. ret->sbuf);
  621. GNUNET_free_non_null (ret->abuf);
  622. ret->abuf = temp;
  623. ret->sbuf = temp;
  624. ret->blen = ret->slen + extra_chars + 1;
  625. ret->slen = ret->slen + extra_chars;
  626. }
  627. /**
  628. * Format a string buffer. Note that optimizing this function is not
  629. * really worth it, it is rarely called.
  630. *
  631. * @param ret where to write the result
  632. * @param format format string, fprintf-style, with exactly one "%.*s"
  633. * @param extra_chars how long will the result be, in addition to 'sarg' length
  634. * @param sarg string to print into the format
  635. */
  636. static void
  637. sb_printf1 (struct StringBuffer *ret,
  638. const char *format,
  639. size_t extra_chars,
  640. const struct StringBuffer *sarg)
  641. {
  642. if (ret->blen < sarg->slen + extra_chars + 1)
  643. sb_realloc (ret,
  644. sarg->slen + extra_chars + 1);
  645. ret->null_flag = GNUNET_NO;
  646. ret->sbuf = ret->abuf;
  647. ret->slen = sarg->slen + extra_chars;
  648. GNUNET_snprintf (ret->sbuf,
  649. ret->blen,
  650. format,
  651. (int) sarg->slen,
  652. sarg->sbuf);
  653. }
  654. /**
  655. * Format a string buffer.
  656. *
  657. * @param ret where to write the result
  658. * @param format format string, fprintf-style, with exactly two "%.*s"
  659. * @param extra_chars how long will the result be, in addition to 'sarg1/2' length
  660. * @param sarg1 first string to print into the format
  661. * @param sarg2 second string to print into the format
  662. */
  663. static void
  664. sb_printf2 (struct StringBuffer *ret,
  665. const char *format,
  666. size_t extra_chars,
  667. const struct StringBuffer *sarg1,
  668. const struct StringBuffer *sarg2)
  669. {
  670. if (ret->blen < sarg1->slen + sarg2->slen + extra_chars + 1)
  671. sb_realloc (ret,
  672. sarg1->slen + sarg2->slen + extra_chars + 1);
  673. ret->null_flag = GNUNET_NO;
  674. ret->slen = sarg1->slen + sarg2->slen + extra_chars;
  675. ret->sbuf = ret->abuf;
  676. GNUNET_snprintf (ret->sbuf,
  677. ret->blen,
  678. format,
  679. (int) sarg1->slen,
  680. sarg1->sbuf,
  681. (int) sarg2->slen,
  682. sarg2->sbuf);
  683. }
  684. /**
  685. * Format a string buffer. Note that optimizing this function is not
  686. * really worth it, it is rarely called.
  687. *
  688. * @param ret where to write the result
  689. * @param format format string, fprintf-style, with exactly three "%.*s"
  690. * @param extra_chars how long will the result be, in addition to 'sarg1/2/3' length
  691. * @param sarg1 first string to print into the format
  692. * @param sarg2 second string to print into the format
  693. * @param sarg3 third string to print into the format
  694. */
  695. static void
  696. sb_printf3 (struct StringBuffer *ret,
  697. const char *format,
  698. size_t extra_chars,
  699. const struct StringBuffer *sarg1,
  700. const struct StringBuffer *sarg2,
  701. const struct StringBuffer *sarg3)
  702. {
  703. if (ret->blen < sarg1->slen + sarg2->slen + sarg3->slen + extra_chars + 1)
  704. sb_realloc (ret,
  705. sarg1->slen + sarg2->slen + sarg3->slen + extra_chars + 1);
  706. ret->null_flag = GNUNET_NO;
  707. ret->slen = sarg1->slen + sarg2->slen + sarg3->slen + extra_chars;
  708. ret->sbuf = ret->abuf;
  709. GNUNET_snprintf (ret->sbuf,
  710. ret->blen,
  711. format,
  712. (int) sarg1->slen,
  713. sarg1->sbuf,
  714. (int) sarg2->slen,
  715. sarg2->sbuf,
  716. (int) sarg3->slen,
  717. sarg3->sbuf);
  718. }
  719. /**
  720. * Free resources of the given string buffer.
  721. *
  722. * @param sb buffer to free (actual pointer is not freed, as they
  723. * should not be individually allocated)
  724. */
  725. static void
  726. sb_free (struct StringBuffer *sb)
  727. {
  728. GNUNET_array_grow (sb->abuf,
  729. sb->blen,
  730. 0);
  731. sb->slen = 0;
  732. sb->sbuf = NULL;
  733. sb->null_flag= GNUNET_YES;
  734. }
  735. /**
  736. * Copy the given string buffer from 'in' to 'out'.
  737. *
  738. * @param in input string
  739. * @param out output string
  740. */
  741. static void
  742. sb_strdup (struct StringBuffer *out,
  743. const struct StringBuffer *in)
  744. {
  745. out->null_flag = in->null_flag;
  746. if (GNUNET_YES == out->null_flag)
  747. return;
  748. if (out->blen < in->slen)
  749. {
  750. GNUNET_array_grow (out->abuf,
  751. out->blen,
  752. in->slen);
  753. }
  754. out->sbuf = out->abuf;
  755. out->slen = in->slen;
  756. memcpy (out->sbuf, in->sbuf, out->slen);
  757. }
  758. /**
  759. * Copy the given string buffer from 'in' to 'out'.
  760. *
  761. * @param cstr input string
  762. * @param out output string
  763. */
  764. static void
  765. sb_strdup_cstr (struct StringBuffer *out,
  766. const char *cstr)
  767. {
  768. if (NULL == cstr)
  769. {
  770. out->null_flag = GNUNET_YES;
  771. return;
  772. }
  773. out->null_flag = GNUNET_NO;
  774. out->slen = strlen (cstr);
  775. if (out->blen < out->slen)
  776. {
  777. GNUNET_array_grow (out->abuf,
  778. out->blen,
  779. out->slen);
  780. }
  781. out->sbuf = out->abuf;
  782. memcpy (out->sbuf, cstr, out->slen);
  783. }
  784. /**
  785. * Check if the given string 'str' needs parentheses around it when
  786. * using it to generate a regex.
  787. *
  788. * @param str string
  789. *
  790. * @return GNUNET_YES if parentheses are needed, GNUNET_NO otherwise
  791. */
  792. static int
  793. needs_parentheses (const struct StringBuffer *str)
  794. {
  795. size_t slen;
  796. const char *op;
  797. const char *cl;
  798. const char *pos;
  799. const char *end;
  800. unsigned int cnt;
  801. if ((GNUNET_YES == str->null_flag) || ((slen = str->slen) < 2))
  802. return GNUNET_NO;
  803. pos = str->sbuf;
  804. if ('(' != pos[0])
  805. return GNUNET_YES;
  806. end = str->sbuf + slen;
  807. cnt = 1;
  808. pos++;
  809. while (cnt > 0)
  810. {
  811. cl = memchr (pos, ')', end - pos);
  812. if (NULL == cl)
  813. {
  814. GNUNET_break (0);
  815. return GNUNET_YES;
  816. }
  817. /* while '(' before ')', count opening parens */
  818. while ( (NULL != (op = memchr (pos, '(', end - pos))) &&
  819. (op < cl) )
  820. {
  821. cnt++;
  822. pos = op + 1;
  823. }
  824. /* got ')' first */
  825. cnt--;
  826. pos = cl + 1;
  827. }
  828. return (*pos == '\0') ? GNUNET_NO : GNUNET_YES;
  829. }
  830. /**
  831. * Remove parentheses surrounding string 'str'.
  832. * Example: "(a)" becomes "a", "(a|b)|(a|c)" stays the same.
  833. * You need to GNUNET_free the returned string.
  834. *
  835. * @param str string, modified to contain a
  836. * @return string without surrounding parentheses, string 'str' if no preceding
  837. * epsilon could be found, NULL if 'str' was NULL
  838. */
  839. static void
  840. remove_parentheses (struct StringBuffer *str)
  841. {
  842. size_t slen;
  843. const char *pos;
  844. const char *end;
  845. const char *sbuf;
  846. const char *op;
  847. const char *cp;
  848. unsigned int cnt;
  849. if (0)
  850. return;
  851. sbuf = str->sbuf;
  852. if ( (GNUNET_YES == str->null_flag) ||
  853. (1 >= (slen = str->slen)) ||
  854. ('(' != str->sbuf[0]) ||
  855. (')' != str->sbuf[slen - 1]) )
  856. return;
  857. cnt = 0;
  858. pos = &sbuf[1];
  859. end = &sbuf[slen - 1];
  860. op = memchr (pos, '(', end - pos);
  861. cp = memchr (pos, ')', end - pos);
  862. while (NULL != cp)
  863. {
  864. while ( (NULL != op) &&
  865. (op < cp) )
  866. {
  867. cnt++;
  868. pos = op + 1;
  869. op = memchr (pos, '(', end - pos);
  870. }
  871. while ( (NULL != cp) &&
  872. ( (NULL == op) ||
  873. (cp < op) ) )
  874. {
  875. if (0 == cnt)
  876. return; /* can't strip parens */
  877. cnt--;
  878. pos = cp + 1;
  879. cp = memchr (pos, ')', end - pos);
  880. }
  881. }
  882. if (0 != cnt)
  883. {
  884. GNUNET_break (0);
  885. return;
  886. }
  887. str->sbuf++;
  888. str->slen -= 2;
  889. }
  890. /**
  891. * Check if the string 'str' starts with an epsilon (empty string).
  892. * Example: "(|a)" is starting with an epsilon.
  893. *
  894. * @param str string to test
  895. *
  896. * @return 0 if str has no epsilon, 1 if str starts with '(|' and ends with ')'
  897. */
  898. static int
  899. has_epsilon (const struct StringBuffer *str)
  900. {
  901. return
  902. (GNUNET_YES != str->null_flag) &&
  903. (0 < str->slen) &&
  904. ('(' == str->sbuf[0]) &&
  905. ('|' == str->sbuf[1]) &&
  906. (')' == str->sbuf[str->slen - 1]);
  907. }
  908. /**
  909. * Remove an epsilon from the string str. Where epsilon is an empty string
  910. * Example: str = "(|a|b|c)", result: "a|b|c"
  911. * The returned string needs to be freed.
  912. *
  913. * @param str original string
  914. * @param ret where to return string without preceding epsilon, string 'str' if no preceding
  915. * epsilon could be found, NULL if 'str' was NULL
  916. */
  917. static void
  918. remove_epsilon (const struct StringBuffer *str,
  919. struct StringBuffer *ret)
  920. {
  921. if (GNUNET_YES == str->null_flag)
  922. {
  923. ret->null_flag = GNUNET_YES;
  924. return;
  925. }
  926. if ( (str->slen > 1) &&
  927. ('(' == str->sbuf[0]) &&
  928. ('|' == str->sbuf[1]) &&
  929. (')' == str->sbuf[str->slen - 1]) )
  930. {
  931. /* remove epsilon */
  932. if (ret->blen < str->slen - 3)
  933. {
  934. GNUNET_array_grow (ret->abuf,
  935. ret->blen,
  936. str->slen - 3);
  937. }
  938. ret->sbuf = ret->abuf;
  939. ret->slen = str->slen - 3;
  940. memcpy (ret->sbuf, &str->sbuf[2], ret->slen);
  941. return;
  942. }
  943. sb_strdup (ret, str);
  944. }
  945. /**
  946. * Compare n bytes of 'str1' and 'str2'
  947. *
  948. * @param str1 first string to compare
  949. * @param str2 second string for comparison
  950. * @param n number of bytes to compare
  951. *
  952. * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise
  953. */
  954. static int
  955. sb_strncmp (const struct StringBuffer *str1,
  956. const struct StringBuffer *str2, size_t n)
  957. {
  958. size_t max;
  959. if ( (str1->slen != str2->slen) &&
  960. ( (str1->slen < n) ||
  961. (str2->slen < n) ) )
  962. return -1;
  963. max = GNUNET_MAX (str1->slen, str2->slen);
  964. if (max > n)
  965. max = n;
  966. return memcmp (str1->sbuf, str2->sbuf, max);
  967. }
  968. /**
  969. * Compare n bytes of 'str1' and 'str2'
  970. *
  971. * @param str1 first string to compare
  972. * @param str2 second C string for comparison
  973. * @param n number of bytes to compare (and length of str2)
  974. *
  975. * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise
  976. */
  977. static int
  978. sb_strncmp_cstr (const struct StringBuffer *str1,
  979. const char *str2, size_t n)
  980. {
  981. if (str1->slen < n)
  982. return -1;
  983. return memcmp (str1->sbuf, str2, n);
  984. }
  985. /**
  986. * Initialize string buffer for storing strings of up to n
  987. * characters.
  988. *
  989. * @param sb buffer to initialize
  990. * @param n desired target length
  991. */
  992. static void
  993. sb_init (struct StringBuffer *sb,
  994. size_t n)
  995. {
  996. sb->null_flag = GNUNET_NO;
  997. sb->abuf = sb->sbuf = (0 == n) ? NULL : GNUNET_malloc (n);
  998. sb->blen = n;
  999. sb->slen = 0;
  1000. }
  1001. /**
  1002. * Compare 'str1', starting from position 'k', with whole 'str2'
  1003. *
  1004. * @param str1 first string to compare, starting from position 'k'
  1005. * @param str2 second string for comparison
  1006. * @param k starting position in 'str1'
  1007. *
  1008. * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise
  1009. */
  1010. static int
  1011. sb_strkcmp (const struct StringBuffer *str1,
  1012. const struct StringBuffer *str2, size_t k)
  1013. {
  1014. if ( (GNUNET_YES == str1->null_flag) ||
  1015. (GNUNET_YES == str2->null_flag) ||
  1016. (k > str1->slen) ||
  1017. (str1->slen - k != str2->slen) )
  1018. return -1;
  1019. return memcmp (&str1->sbuf[k], str2->sbuf, str2->slen);
  1020. }
  1021. /**
  1022. * Helper function used as 'action' in 'REGEX_INTERNAL_automaton_traverse'
  1023. * function to create the depth-first numbering of the states.
  1024. *
  1025. * @param cls states array.
  1026. * @param count current state counter.
  1027. * @param s current state.
  1028. */
  1029. static void
  1030. number_states (void *cls, const unsigned int count,
  1031. struct REGEX_INTERNAL_State *s)
  1032. {
  1033. struct REGEX_INTERNAL_State **states = cls;
  1034. s->dfs_id = count;
  1035. if (NULL != states)
  1036. states[count] = s;
  1037. }
  1038. #define PRIS(a) \
  1039. ((GNUNET_YES == a.null_flag) ? 6 : (int) a.slen), \
  1040. ((GNUNET_YES == a.null_flag) ? "(null)" : a.sbuf)
  1041. /**
  1042. * Construct the regular expression given the inductive step,
  1043. * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^*
  1044. * R^{(k-1)}_{kj}, and simplify the resulting expression saved in R_cur_ij.
  1045. *
  1046. * @param R_last_ij value of $R^{(k-1)_{ij}.
  1047. * @param R_last_ik value of $R^{(k-1)_{ik}.
  1048. * @param R_last_kk value of $R^{(k-1)_{kk}.
  1049. * @param R_last_kj value of $R^{(k-1)_{kj}.
  1050. * @param R_cur_ij result for this inductive step is saved in R_cur_ij, R_cur_ij
  1051. * is expected to be NULL when called!
  1052. * @param R_cur_l optimization -- kept between iterations to avoid realloc
  1053. * @param R_cur_r optimization -- kept between iterations to avoid realloc
  1054. */
  1055. static void
  1056. automaton_create_proofs_simplify (const struct StringBuffer *R_last_ij,
  1057. const struct StringBuffer *R_last_ik,
  1058. const struct StringBuffer *R_last_kk,
  1059. const struct StringBuffer *R_last_kj,
  1060. struct StringBuffer *R_cur_ij,
  1061. struct StringBuffer *R_cur_l,
  1062. struct StringBuffer *R_cur_r)
  1063. {
  1064. struct StringBuffer R_temp_ij;
  1065. struct StringBuffer R_temp_ik;
  1066. struct StringBuffer R_temp_kj;
  1067. struct StringBuffer R_temp_kk;
  1068. int eps_check;
  1069. int ij_ik_cmp;
  1070. int ij_kj_cmp;
  1071. int ik_kk_cmp;
  1072. int kk_kj_cmp;
  1073. int clean_ik_kk_cmp;
  1074. int clean_kk_kj_cmp;
  1075. size_t length;
  1076. size_t length_l;
  1077. size_t length_r;
  1078. /*
  1079. * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj}
  1080. * R_last == R^{(k-1)}, R_cur == R^{(k)}
  1081. * R_cur_ij = R_cur_l | R_cur_r
  1082. * R_cur_l == R^{(k-1)}_{ij}
  1083. * R_cur_r == R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj}
  1084. */
  1085. if ( (GNUNET_YES == R_last_ij->null_flag) &&
  1086. ( (GNUNET_YES == R_last_ik->null_flag) ||
  1087. (GNUNET_YES == R_last_kj->null_flag)))
  1088. {
  1089. /* R^{(k)}_{ij} = N | N */
  1090. R_cur_ij->null_flag = GNUNET_YES;
  1091. R_cur_ij->synced = GNUNET_NO;
  1092. return;
  1093. }
  1094. if ( (GNUNET_YES == R_last_ik->null_flag) ||
  1095. (GNUNET_YES == R_last_kj->null_flag) )
  1096. {
  1097. /* R^{(k)}_{ij} = R^{(k-1)}_{ij} | N */
  1098. if (GNUNET_YES == R_last_ij->synced)
  1099. {
  1100. R_cur_ij->synced = GNUNET_YES;
  1101. R_cur_ij->null_flag = GNUNET_NO;
  1102. return;
  1103. }
  1104. R_cur_ij->synced = GNUNET_YES;
  1105. sb_strdup (R_cur_ij, R_last_ij);
  1106. return;
  1107. }
  1108. R_cur_ij->synced = GNUNET_NO;
  1109. /* $R^{(k)}_{ij} = N | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} OR
  1110. * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} */
  1111. R_cur_r->null_flag = GNUNET_YES;
  1112. R_cur_r->slen = 0;
  1113. R_cur_l->null_flag = GNUNET_YES;
  1114. R_cur_l->slen = 0;
  1115. /* cache results from strcmp, we might need these many times */
  1116. ij_kj_cmp = sb_nullstrcmp (R_last_ij, R_last_kj);
  1117. ij_ik_cmp = sb_nullstrcmp (R_last_ij, R_last_ik);
  1118. ik_kk_cmp = sb_nullstrcmp (R_last_ik, R_last_kk);
  1119. kk_kj_cmp = sb_nullstrcmp (R_last_kk, R_last_kj);
  1120. /* Assign R_temp_(ik|kk|kj) to R_last[][] and remove epsilon as well
  1121. * as parentheses, so we can better compare the contents */
  1122. memset (&R_temp_ij, 0, sizeof (struct StringBuffer));
  1123. memset (&R_temp_ik, 0, sizeof (struct StringBuffer));
  1124. memset (&R_temp_kk, 0, sizeof (struct StringBuffer));
  1125. memset (&R_temp_kj, 0, sizeof (struct StringBuffer));
  1126. remove_epsilon (R_last_ik, &R_temp_ik);
  1127. remove_epsilon (R_last_kk, &R_temp_kk);
  1128. remove_epsilon (R_last_kj, &R_temp_kj);
  1129. remove_parentheses (&R_temp_ik);
  1130. remove_parentheses (&R_temp_kk);
  1131. remove_parentheses (&R_temp_kj);
  1132. clean_ik_kk_cmp = sb_nullstrcmp (R_last_ik, &R_temp_kk);
  1133. clean_kk_kj_cmp = sb_nullstrcmp (&R_temp_kk, R_last_kj);
  1134. /* construct R_cur_l (and, if necessary R_cur_r) */
  1135. if (GNUNET_YES != R_last_ij->null_flag)
  1136. {
  1137. /* Assign R_temp_ij to R_last_ij and remove epsilon as well
  1138. * as parentheses, so we can better compare the contents */
  1139. remove_epsilon (R_last_ij, &R_temp_ij);
  1140. remove_parentheses (&R_temp_ij);
  1141. if ( (0 == sb_strcmp (&R_temp_ij, &R_temp_ik)) &&
  1142. (0 == sb_strcmp (&R_temp_ik, &R_temp_kk)) &&
  1143. (0 == sb_strcmp (&R_temp_kk, &R_temp_kj)) )
  1144. {
  1145. if (0 == R_temp_ij.slen)
  1146. {
  1147. R_cur_r->null_flag = GNUNET_NO;
  1148. }
  1149. else if ((0 == sb_strncmp_cstr (R_last_ij, "(|", 2)) ||
  1150. (0 == sb_strncmp_cstr (R_last_ik, "(|", 2) &&
  1151. 0 == sb_strncmp_cstr (R_last_kj, "(|", 2)))
  1152. {
  1153. /*
  1154. * a|(e|a)a*(e|a) = a*
  1155. * a|(e|a)(e|a)*(e|a) = a*
  1156. * (e|a)|aa*a = a*
  1157. * (e|a)|aa*(e|a) = a*
  1158. * (e|a)|(e|a)a*a = a*
  1159. * (e|a)|(e|a)a*(e|a) = a*
  1160. * (e|a)|(e|a)(e|a)*(e|a) = a*
  1161. */
  1162. if (GNUNET_YES == needs_parentheses (&R_temp_ij))
  1163. sb_printf1 (R_cur_r, "(%.*s)*", 3, &R_temp_ij);
  1164. else
  1165. sb_printf1 (R_cur_r, "%.*s*", 1, &R_temp_ij);
  1166. }
  1167. else
  1168. {
  1169. /*
  1170. * a|aa*a = a+
  1171. * a|(e|a)a*a = a+
  1172. * a|aa*(e|a) = a+
  1173. * a|(e|a)(e|a)*a = a+
  1174. * a|a(e|a)*(e|a) = a+
  1175. */
  1176. if (GNUNET_YES == needs_parentheses (&R_temp_ij))
  1177. sb_printf1 (R_cur_r, "(%.*s)+", 3, &R_temp_ij);
  1178. else
  1179. sb_printf1 (R_cur_r, "%.*s+", 1, &R_temp_ij);
  1180. }
  1181. }
  1182. else if ( (0 == ij_ik_cmp) && (0 == clean_kk_kj_cmp) && (0 != clean_ik_kk_cmp) )
  1183. {
  1184. /* a|ab*b = ab* */
  1185. if (0 == R_last_kk->slen)
  1186. sb_strdup (R_cur_r, R_last_ij);
  1187. else if (GNUNET_YES == needs_parentheses (&R_temp_kk))
  1188. sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ij, &R_temp_kk);
  1189. else
  1190. sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ij, R_last_kk);
  1191. R_cur_l->null_flag = GNUNET_YES;
  1192. }
  1193. else if ( (0 == ij_kj_cmp) && (0 == clean_ik_kk_cmp) && (0 != clean_kk_kj_cmp))
  1194. {
  1195. /* a|bb*a = b*a */
  1196. if (R_last_kk->slen < 1)
  1197. {
  1198. sb_strdup (R_cur_r, R_last_kj);
  1199. }
  1200. else if (GNUNET_YES == needs_parentheses (&R_temp_kk))
  1201. sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_kj);
  1202. else
  1203. sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_kj);
  1204. R_cur_l->null_flag = GNUNET_YES;
  1205. }
  1206. else if ( (0 == ij_ik_cmp) && (0 == kk_kj_cmp) && (! has_epsilon (R_last_ij)) &&
  1207. has_epsilon (R_last_kk))
  1208. {
  1209. /* a|a(e|b)*(e|b) = a|ab* = a|a|ab|abb|abbb|... = ab* */
  1210. if (needs_parentheses (&R_temp_kk))
  1211. sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ij, &R_temp_kk);
  1212. else
  1213. sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ij, &R_temp_kk);
  1214. R_cur_l->null_flag = GNUNET_YES;
  1215. }
  1216. else if ( (0 == ij_kj_cmp) && (0 == ik_kk_cmp) && (! has_epsilon (R_last_ij)) &&
  1217. has_epsilon (R_last_kk))
  1218. {
  1219. /* a|(e|b)(e|b)*a = a|b*a = a|a|ba|bba|bbba|... = b*a */
  1220. if (needs_parentheses (&R_temp_kk))
  1221. sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_ij);
  1222. else
  1223. sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_ij);
  1224. R_cur_l->null_flag = GNUNET_YES;
  1225. }
  1226. else
  1227. {
  1228. sb_strdup (R_cur_l, R_last_ij);
  1229. remove_parentheses (R_cur_l);
  1230. }
  1231. }
  1232. else
  1233. {
  1234. /* we have no left side */
  1235. R_cur_l->null_flag = GNUNET_YES;
  1236. }
  1237. /* construct R_cur_r, if not already constructed */
  1238. if (GNUNET_YES == R_cur_r->null_flag)
  1239. {
  1240. length = R_temp_kk.slen - R_last_ik->slen;
  1241. /* a(ba)*bx = (ab)+x */
  1242. if ( (length > 0) &&
  1243. (GNUNET_YES != R_last_kk->null_flag) &&
  1244. (0 < R_last_kk->slen) &&
  1245. (GNUNET_YES != R_last_kj->null_flag) &&
  1246. (0 < R_last_kj->slen) &&
  1247. (GNUNET_YES != R_last_ik->null_flag) &&
  1248. (0 < R_last_ik->slen) &&
  1249. (0 == sb_strkcmp (&R_temp_kk, R_last_ik, length)) &&
  1250. (0 == sb_strncmp (&R_temp_kk, R_last_kj, length)) )
  1251. {
  1252. struct StringBuffer temp_a;
  1253. struct StringBuffer temp_b;
  1254. sb_init (&temp_a, length);
  1255. sb_init (&temp_b, R_last_kj->slen - length);
  1256. length_l = length;
  1257. temp_a.sbuf = temp_a.abuf;
  1258. memcpy (temp_a.sbuf, R_last_kj->sbuf, length_l);
  1259. temp_a.slen = length_l;
  1260. length_r = R_last_kj->slen - length;
  1261. temp_b.sbuf = temp_b.abuf;
  1262. memcpy (temp_b.sbuf, &R_last_kj->sbuf[length], length_r);
  1263. temp_b.slen = length_r;
  1264. /* e|(ab)+ = (ab)* */
  1265. if ( (GNUNET_YES != R_cur_l->null_flag) &&
  1266. (0 == R_cur_l->slen) &&
  1267. (0 == temp_b.slen) )
  1268. {
  1269. sb_printf2 (R_cur_r, "(%.*s%.*s)*", 3, R_last_ik, &temp_a);
  1270. sb_free (R_cur_l);
  1271. R_cur_l->null_flag = GNUNET_YES;
  1272. }
  1273. else
  1274. {
  1275. sb_printf3 (R_cur_r, "(%.*s%.*s)+%.*s", 3, R_last_ik, &temp_a, &temp_b);
  1276. }
  1277. sb_free (&temp_a);
  1278. sb_free (&temp_b);
  1279. }
  1280. else if (0 == sb_strcmp (&R_temp_ik, &R_temp_kk) &&
  1281. 0 == sb_strcmp (&R_temp_kk, &R_temp_kj))
  1282. {
  1283. /*
  1284. * (e|a)a*(e|a) = a*
  1285. * (e|a)(e|a)*(e|a) = a*
  1286. */
  1287. if (has_epsilon (R_last_ik) && has_epsilon (R_last_kj))
  1288. {
  1289. if (needs_parentheses (&R_temp_kk))
  1290. sb_printf1 (R_cur_r, "(%.*s)*", 3, &R_temp_kk);
  1291. else
  1292. sb_printf1 (R_cur_r, "%.*s*", 1, &R_temp_kk);
  1293. }
  1294. /* aa*a = a+a */
  1295. else if ( (0 == clean_ik_kk_cmp) &&
  1296. (0 == clean_kk_kj_cmp) &&
  1297. (! has_epsilon (R_last_ik)) )
  1298. {
  1299. if (needs_parentheses (&R_temp_kk))
  1300. sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, &R_temp_kk, &R_temp_kk);
  1301. else
  1302. sb_printf2 (R_cur_r, "%.*s+%.*s", 1, &R_temp_kk, &R_temp_kk);
  1303. }
  1304. /*
  1305. * (e|a)a*a = a+
  1306. * aa*(e|a) = a+
  1307. * a(e|a)*(e|a) = a+
  1308. * (e|a)a*a = a+
  1309. */
  1310. else
  1311. {
  1312. eps_check =
  1313. (has_epsilon (R_last_ik) + has_epsilon (R_last_kk) +
  1314. has_epsilon (R_last_kj));
  1315. if (1 == eps_check)
  1316. {
  1317. if (needs_parentheses (&R_temp_kk))
  1318. sb_printf1 (R_cur_r, "(%.*s)+", 3, &R_temp_kk);
  1319. else
  1320. sb_printf1 (R_cur_r, "%.*s+", 1, &R_temp_kk);
  1321. }
  1322. }
  1323. }
  1324. /*
  1325. * aa*b = a+b
  1326. * (e|a)(e|a)*b = a*b
  1327. */
  1328. else if (0 == sb_strcmp (&R_temp_ik, &R_temp_kk))
  1329. {
  1330. if (has_epsilon (R_last_ik))
  1331. {
  1332. if (needs_parentheses (&R_temp_kk))
  1333. sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_kj);
  1334. else
  1335. sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_kj);
  1336. }
  1337. else
  1338. {
  1339. if (needs_parentheses (&R_temp_kk))
  1340. sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, &R_temp_kk, R_last_kj);
  1341. else
  1342. sb_printf2 (R_cur_r, "%.*s+%.*s", 1, &R_temp_kk, R_last_kj);
  1343. }
  1344. }
  1345. /*
  1346. * ba*a = ba+
  1347. * b(e|a)*(e|a) = ba*
  1348. */
  1349. else if (0 == sb_strcmp (&R_temp_kk, &R_temp_kj))
  1350. {
  1351. if (has_epsilon (R_last_kj))
  1352. {
  1353. if (needs_parentheses (&R_temp_kk))
  1354. sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ik, &R_temp_kk);
  1355. else
  1356. sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ik, &R_temp_kk);
  1357. }
  1358. else
  1359. {
  1360. if (needs_parentheses (&R_temp_kk))
  1361. sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, R_last_ik, &R_temp_kk);
  1362. else
  1363. sb_printf2 (R_cur_r, "%.*s+%.*s", 1, R_last_ik, &R_temp_kk);
  1364. }
  1365. }
  1366. else
  1367. {
  1368. if (0 < R_temp_kk.slen)
  1369. {
  1370. if (needs_parentheses (&R_temp_kk))
  1371. {
  1372. sb_printf3 (R_cur_r, "%.*s(%.*s)*%.*s", 3, R_last_ik, &R_temp_kk,
  1373. R_last_kj);
  1374. }
  1375. else
  1376. {
  1377. sb_printf3 (R_cur_r, "%.*s%.*s*%.*s", 1, R_last_ik, &R_temp_kk,
  1378. R_last_kj);
  1379. }
  1380. }
  1381. else
  1382. {
  1383. sb_printf2 (R_cur_r, "%.*s%.*s", 0, R_last_ik, R_last_kj);
  1384. }
  1385. }
  1386. }
  1387. sb_free (&R_temp_ij);
  1388. sb_free (&R_temp_ik);
  1389. sb_free (&R_temp_kk);
  1390. sb_free (&R_temp_kj);
  1391. if ( (GNUNET_YES == R_cur_l->null_flag) &&
  1392. (GNUNET_YES == R_cur_r->null_flag) )
  1393. {
  1394. R_cur_ij->null_flag = GNUNET_YES;
  1395. return;
  1396. }
  1397. if ( (GNUNET_YES != R_cur_l->null_flag) &&
  1398. (GNUNET_YES == R_cur_r->null_flag) )
  1399. {
  1400. struct StringBuffer tmp;
  1401. tmp = *R_cur_ij;
  1402. *R_cur_ij = *R_cur_l;
  1403. *R_cur_l = tmp;
  1404. return;
  1405. }
  1406. if ( (GNUNET_YES == R_cur_l->null_flag) &&
  1407. (GNUNET_YES != R_cur_r->null_flag) )
  1408. {
  1409. struct StringBuffer tmp;
  1410. tmp = *R_cur_ij;
  1411. *R_cur_ij = *R_cur_r;
  1412. *R_cur_r = tmp;
  1413. return;
  1414. }
  1415. if (0 == sb_nullstrcmp (R_cur_l, R_cur_r))
  1416. {
  1417. struct StringBuffer tmp;
  1418. tmp = *R_cur_ij;
  1419. *R_cur_ij = *R_cur_l;
  1420. *R_cur_l = tmp;
  1421. return;
  1422. }
  1423. sb_printf2 (R_cur_ij, "(%.*s|%.*s)", 3, R_cur_l, R_cur_r);
  1424. }
  1425. /**
  1426. * Create proofs for all states in the given automaton. Implementation of the
  1427. * algorithm descriped in chapter 3.2.1 of "Automata Theory, Languages, and
  1428. * Computation 3rd Edition" by Hopcroft, Motwani and Ullman.
  1429. *
  1430. * Each state in the automaton gets assigned 'proof' and 'hash' (hash of the
  1431. * proof) fields. The starting state will only have a valid proof/hash if it has
  1432. * any incoming transitions.
  1433. *
  1434. * @param a automaton for which to assign proofs and hashes, must not be NULL
  1435. */
  1436. static int
  1437. automaton_create_proofs (struct REGEX_INTERNAL_Automaton *a)
  1438. {
  1439. unsigned int n = a->state_count;
  1440. struct REGEX_INTERNAL_State *states[n];
  1441. struct StringBuffer *R_last;
  1442. struct StringBuffer *R_cur;
  1443. struct StringBuffer R_cur_r;
  1444. struct StringBuffer R_cur_l;
  1445. struct StringBuffer *R_swap;
  1446. struct REGEX_INTERNAL_Transition *t;
  1447. struct StringBuffer complete_regex;
  1448. unsigned int i;
  1449. unsigned int j;
  1450. unsigned int k;
  1451. R_last = GNUNET_malloc_large (sizeof (struct StringBuffer) * n * n);
  1452. R_cur = GNUNET_malloc_large (sizeof (struct StringBuffer) * n * n);
  1453. if ( (NULL == R_last) ||
  1454. (NULL == R_cur) )
  1455. {
  1456. GNUNET_log_strerror (GNUNET_ERROR_TYPE_ERROR, "malloc");
  1457. GNUNET_free_non_null (R_cur);
  1458. GNUNET_free_non_null (R_last);
  1459. return GNUNET_SYSERR;
  1460. }
  1461. /* create depth-first numbering of the states, initializes 'state' */
  1462. REGEX_INTERNAL_automaton_traverse (a, a->start, NULL, NULL, &number_states,
  1463. states);
  1464. for (i = 0; i < n; i++)
  1465. GNUNET_assert (NULL != states[i]);
  1466. for (i = 0; i < n; i++)
  1467. for (j = 0; j < n; j++)
  1468. R_last[i *n + j].null_flag = GNUNET_YES;
  1469. /* Compute regular expressions of length "1" between each pair of states */
  1470. for (i = 0; i < n; i++)
  1471. {
  1472. for (t = states[i]->transitions_head; NULL != t; t = t->next)
  1473. {
  1474. j = t->to_state->dfs_id;
  1475. if (GNUNET_YES == R_last[i * n + j].null_flag)
  1476. {
  1477. sb_strdup_cstr (&R_last[i * n + j], t->label);
  1478. }
  1479. else
  1480. {
  1481. sb_append_cstr (&R_last[i * n + j], "|");
  1482. sb_append_cstr (&R_last[i * n + j], t->label);
  1483. }
  1484. }
  1485. /* add self-loop: i is reachable from i via epsilon-transition */
  1486. if (GNUNET_YES == R_last[i * n + i].null_flag)
  1487. {
  1488. R_last[i * n + i].slen = 0;
  1489. R_last[i * n + i].null_flag = GNUNET_NO;
  1490. }
  1491. else
  1492. {
  1493. sb_wrap (&R_last[i * n + i], "(|%.*s)", 3);
  1494. }
  1495. }
  1496. for (i = 0; i < n; i++)
  1497. for (j = 0; j < n; j++)
  1498. if (needs_parentheses (&R_last[i * n + j]))
  1499. sb_wrap (&R_last[i * n + j], "(%.*s)", 2);
  1500. /* Compute regular expressions of length "k" between each pair of states per
  1501. * induction */
  1502. memset (&R_cur_l, 0, sizeof (struct StringBuffer));
  1503. memset (&R_cur_r, 0, sizeof (struct StringBuffer));
  1504. for (k = 0; k < n; k++)
  1505. {
  1506. for (i = 0; i < n; i++)
  1507. {
  1508. for (j = 0; j < n; j++)
  1509. {
  1510. /* Basis for the recursion:
  1511. * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj}
  1512. * R_last == R^{(k-1)}, R_cur == R^{(k)}
  1513. */
  1514. /* Create R_cur[i][j] and simplify the expression */
  1515. automaton_create_proofs_simplify (&R_last[i * n + j], &R_last[i * n + k],
  1516. &R_last[k * n + k], &R_last[k * n + j],
  1517. &R_cur[i * n + j],
  1518. &R_cur_l, &R_cur_r);
  1519. }
  1520. }
  1521. /* set R_last = R_cur */
  1522. R_swap = R_last;
  1523. R_last = R_cur;
  1524. R_cur = R_swap;
  1525. /* clear 'R_cur' for next iteration */
  1526. for (i = 0; i < n; i++)
  1527. for (j = 0; j < n; j++)
  1528. R_cur[i * n + j].null_flag = GNUNET_YES;
  1529. }
  1530. sb_free (&R_cur_l);
  1531. sb_free (&R_cur_r);
  1532. /* assign proofs and hashes */
  1533. for (i = 0; i < n; i++)
  1534. {
  1535. if (GNUNET_YES != R_last[a->start->dfs_id * n + i].null_flag)
  1536. {
  1537. states[i]->proof = GNUNET_strndup (R_last[a->start->dfs_id * n + i].sbuf,
  1538. R_last[a->start->dfs_id * n + i].slen);
  1539. GNUNET_CRYPTO_hash (states[i]->proof, strlen (states[i]->proof),
  1540. &states[i]->hash);
  1541. }
  1542. }
  1543. /* complete regex for whole DFA: union of all pairs (start state/accepting
  1544. * state(s)). */
  1545. sb_init (&complete_regex, 16 * n);
  1546. for (i = 0; i < n; i++)
  1547. {
  1548. if (states[i]->accepting)
  1549. {
  1550. if ( (0 == complete_regex.slen) &&
  1551. (0 < R_last[a->start->dfs_id * n + i].slen) )
  1552. {
  1553. sb_append (&complete_regex,
  1554. &R_last[a->start->dfs_id * n + i]);
  1555. }
  1556. else if ( (GNUNET_YES != R_last[a->start->dfs_id * n + i].null_flag) &&
  1557. (0 < R_last[a->start->dfs_id * n + i].slen) )
  1558. {
  1559. sb_append_cstr (&complete_regex, "|");
  1560. sb_append (&complete_regex,
  1561. &R_last[a->start->dfs_id * n + i]);
  1562. }
  1563. }
  1564. }
  1565. a->canonical_regex = GNUNET_strndup (complete_regex.sbuf, complete_regex.slen);
  1566. /* cleanup */
  1567. sb_free (&complete_regex);
  1568. for (i = 0; i < n; i++)
  1569. for (j = 0; j < n; j++)
  1570. {
  1571. sb_free (&R_cur[i * n + j]);
  1572. sb_free (&R_last[i * n + j]);
  1573. }
  1574. GNUNET_free (R_cur);
  1575. GNUNET_free (R_last);
  1576. return GNUNET_OK;
  1577. }
  1578. /**
  1579. * Creates a new DFA state based on a set of NFA states. Needs to be freed using
  1580. * automaton_destroy_state.
  1581. *
  1582. * @param ctx context
  1583. * @param nfa_states set of NFA states on which the DFA should be based on
  1584. *
  1585. * @return new DFA state
  1586. */
  1587. static struct REGEX_INTERNAL_State *
  1588. dfa_state_create (struct REGEX_INTERNAL_Context *ctx,
  1589. struct REGEX_INTERNAL_StateSet *nfa_states)
  1590. {
  1591. struct REGEX_INTERNAL_State *s;
  1592. char *pos;
  1593. size_t len;
  1594. struct REGEX_INTERNAL_State *cstate;
  1595. struct REGEX_INTERNAL_Transition *ctran;
  1596. unsigned int i;
  1597. s = GNUNET_new (struct REGEX_INTERNAL_State);
  1598. s->id = ctx->state_id++;
  1599. s->index = -1;
  1600. s->lowlink = -1;
  1601. if (NULL == nfa_states)
  1602. {
  1603. GNUNET_asprintf (&s->name, "s%i", s->id);
  1604. return s;
  1605. }
  1606. s->nfa_set = *nfa_states;
  1607. if (nfa_states->off < 1)
  1608. return s;
  1609. /* Create a name based on 'nfa_states' */
  1610. len = nfa_states->off * 14 + 4;
  1611. s->name = GNUNET_malloc (len);
  1612. strcat (s->name, "{");
  1613. pos = s->name + 1;
  1614. for (i = 0; i < nfa_states->off; i++)
  1615. {
  1616. cstate = nfa_states->states[i];
  1617. GNUNET_snprintf (pos, pos - s->name + len,
  1618. "%i,", cstate->id);
  1619. pos += strlen (pos);
  1620. /* Add a transition for each distinct label to NULL state */
  1621. for (ctran = cstate->transitions_head; NULL != ctran; ctran = ctran->next)
  1622. if (NULL != ctran->label)
  1623. state_add_transition (ctx, s, ctran->label, NULL);
  1624. /* If the nfa_states contain an accepting state, the new dfa state is also
  1625. * accepting. */
  1626. if (cstate->accepting)
  1627. s->accepting = 1;
  1628. }
  1629. pos[-1] = '}';
  1630. s->name = GNUNET_realloc (s->name, strlen (s->name) + 1);
  1631. memset (nfa_states, 0, sizeof (struct REGEX_INTERNAL_StateSet));
  1632. return s;
  1633. }
  1634. /**
  1635. * Move from the given state 's' to the next state on transition 'str'. Consumes
  1636. * as much of the given 'str' as possible (usefull for strided DFAs). On return
  1637. * 's' will point to the next state, and the length of the substring used for
  1638. * this transition will be returned. If no transition possible 0 is returned and
  1639. * 's' points to NULL.
  1640. *
  1641. * @param s starting state, will point to the next state or NULL (if no
  1642. * transition possible)
  1643. * @param str edge label to follow (will match longest common prefix)
  1644. *
  1645. * @return length of the substring comsumed from 'str'
  1646. */
  1647. static unsigned int
  1648. dfa_move (struct REGEX_INTERNAL_State **s, const char *str)
  1649. {
  1650. struct REGEX_INTERNAL_Transition *t;
  1651. struct REGEX_INTERNAL_State *new_s;
  1652. unsigned int len;
  1653. unsigned int max_len;
  1654. if (NULL == s)
  1655. return 0;
  1656. new_s = NULL;
  1657. max_len = 0;
  1658. for (t = (*s)->transitions_head; NULL != t; t = t->next)
  1659. {
  1660. len = strlen (t->label);
  1661. if (0 == strncmp (t->label, str, len))
  1662. {
  1663. if (len >= max_len)
  1664. {
  1665. max_len = len;
  1666. new_s = t->to_state;
  1667. }
  1668. }
  1669. }
  1670. *s = new_s;
  1671. return max_len;
  1672. }
  1673. /**
  1674. * Set the given state 'marked' to GNUNET_YES. Used by the
  1675. * 'dfa_remove_unreachable_states' function to detect unreachable states in the
  1676. * automaton.
  1677. *
  1678. * @param cls closure, not used.
  1679. * @param count count, not used.
  1680. * @param s state where the marked attribute will be set to GNUNET_YES.
  1681. */
  1682. static void
  1683. mark_states (void *cls, const unsigned int count, struct REGEX_INTERNAL_State *s)
  1684. {
  1685. s->marked = GNUNET_YES;
  1686. }
  1687. /**
  1688. * Remove all unreachable states from DFA 'a'. Unreachable states are those
  1689. * states that are not reachable from the starting state.
  1690. *
  1691. * @param a DFA automaton
  1692. */
  1693. static void
  1694. dfa_remove_unreachable_states (struct REGEX_INTERNAL_Automaton *a)
  1695. {
  1696. struct REGEX_INTERNAL_State *s;
  1697. struct REGEX_INTERNAL_State *s_next;
  1698. /* 1. unmark all states */
  1699. for (s = a->states_head; NULL != s; s = s->next)
  1700. s->marked = GNUNET_NO;
  1701. /* 2. traverse dfa from start state and mark all visited states */
  1702. REGEX_INTERNAL_automaton_traverse (a, a->start, NULL, NULL, &mark_states, NULL);
  1703. /* 3. delete all states that were not visited */
  1704. for (s = a->states_head; NULL != s; s = s_next)
  1705. {
  1706. s_next = s->next;
  1707. if (GNUNET_NO == s->marked)
  1708. automaton_remove_state (a, s);
  1709. }
  1710. }
  1711. /**
  1712. * Remove all dead states from the DFA 'a'. Dead states are those states that do
  1713. * not transition to any other state but themselves.
  1714. *
  1715. * @param a DFA automaton
  1716. */
  1717. static void
  1718. dfa_remove_dead_states (struct REGEX_INTERNAL_Automaton *a)
  1719. {
  1720. struct REGEX_INTERNAL_State *s;
  1721. struct REGEX_INTERNAL_State *s_next;
  1722. struct REGEX_INTERNAL_Transition *t;
  1723. int dead;
  1724. GNUNET_assert (DFA == a->type);
  1725. for (s = a->states_head; NULL != s; s = s_next)
  1726. {
  1727. s_next = s->next;
  1728. if (s->accepting)
  1729. continue;
  1730. dead = 1;
  1731. for (t = s->transitions_head; NULL != t; t = t->next)
  1732. {
  1733. if (NULL != t->to_state && t->to_state != s)
  1734. {
  1735. dead = 0;
  1736. break;
  1737. }
  1738. }
  1739. if (0 == dead)
  1740. continue;
  1741. /* state s is dead, remove it */
  1742. automaton_remove_state (a, s);
  1743. }
  1744. }
  1745. /**
  1746. * Merge all non distinguishable states in the DFA 'a'
  1747. *
  1748. * @param ctx context
  1749. * @param a DFA automaton
  1750. * @return GNUNET_OK on success
  1751. */
  1752. static int
  1753. dfa_merge_nondistinguishable_states (struct REGEX_INTERNAL_Context *ctx,
  1754. struct REGEX_INTERNAL_Automaton *a)
  1755. {
  1756. uint32_t *table;
  1757. struct REGEX_INTERNAL_State *s1;
  1758. struct REGEX_INTERNAL_State *s2;
  1759. struct REGEX_INTERNAL_Transition *t1;
  1760. struct REGEX_INTERNAL_Transition *t2;
  1761. struct REGEX_INTERNAL_State *s1_next;
  1762. struct REGEX_INTERNAL_State *s2_next;
  1763. int change;
  1764. unsigned int num_equal_edges;
  1765. unsigned int i;
  1766. unsigned int state_cnt;
  1767. unsigned long long idx;
  1768. unsigned long long idx1;
  1769. if ( (NULL == a) || (0 == a->state_count) )
  1770. {
  1771. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  1772. "Could not merge nondistinguishable states, automaton was NULL.\n");
  1773. return GNUNET_SYSERR;
  1774. }
  1775. state_cnt = a->state_count;
  1776. table = GNUNET_malloc_large ((sizeof (uint32_t) * state_cnt * state_cnt / 32) + sizeof (uint32_t));
  1777. if (NULL == table)
  1778. {
  1779. GNUNET_log_strerror (GNUNET_ERROR_TYPE_ERROR, "malloc");
  1780. return GNUNET_SYSERR;
  1781. }
  1782. for (i = 0, s1 = a->states_head; NULL != s1; s1 = s1->next)
  1783. s1->marked = i++;
  1784. /* Mark all pairs of accepting/!accepting states */
  1785. for (s1 = a->states_head; NULL != s1; s1 = s1->next)
  1786. for (s2 = a->states_head; NULL != s2; s2 = s2->next)
  1787. if ( (s1->accepting && !s2->accepting) ||
  1788. (!s1->accepting && s2->accepting) )
  1789. {
  1790. idx = s1->marked * state_cnt + s2->marked;
  1791. table[idx / 32] |= (1 << (idx % 32));
  1792. }
  1793. /* Find all equal states */
  1794. change = 1;
  1795. while (0 != change)
  1796. {
  1797. change = 0;
  1798. for (s1 = a->states_head; NULL != s1; s1 = s1->next)
  1799. {
  1800. for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2->next)
  1801. {
  1802. idx = s1->marked * state_cnt + s2->marked;
  1803. if (0 != (table[idx / 32] & (1 << (idx % 32))))
  1804. continue;
  1805. num_equal_edges = 0;
  1806. for (t1 = s1->transitions_head; NULL != t1; t1 = t1->next)
  1807. {
  1808. for (t2 = s2->transitions_head; NULL != t2; t2 = t2->next)
  1809. {
  1810. if (0 == strcmp (t1->label, t2->label))
  1811. {
  1812. num_equal_edges++;
  1813. /* same edge, but targets definitively different, so we're different
  1814. as well */
  1815. if (t1->to_state->marked > t2->to_state->marked)
  1816. idx1 = t1->to_state->marked * state_cnt + t2->to_state->marked;
  1817. else
  1818. idx1 = t2->to_state->marked * state_cnt + t1->to_state->marked;
  1819. if (0 != (table[idx1 / 32] & (1 << (idx1 % 32))))
  1820. {
  1821. table[idx / 32] |= (1 << (idx % 32));
  1822. change = 1; /* changed a marker, need to run again */
  1823. }
  1824. }
  1825. }
  1826. }
  1827. if ( (num_equal_edges != s1->transition_count) ||
  1828. (num_equal_edges != s2->transition_count) )
  1829. {
  1830. /* Make sure ALL edges of possible equal states are the same */
  1831. table[idx / 32] |= (1 << (idx % 32));
  1832. change = 1; /* changed a marker, need to run again */
  1833. }
  1834. }
  1835. }
  1836. }
  1837. /* Merge states that are equal */
  1838. for (s1 = a->states_head; NULL != s1; s1 = s1_next)
  1839. {
  1840. s1_next = s1->next;
  1841. for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2_next)
  1842. {
  1843. s2_next = s2->next;
  1844. idx = s1->marked * state_cnt + s2->marked;
  1845. if (0 == (table[idx / 32] & (1 << (idx % 32))))
  1846. automaton_merge_states (ctx, a, s1, s2);
  1847. }
  1848. }
  1849. GNUNET_free (table);
  1850. return GNUNET_OK;
  1851. }
  1852. /**
  1853. * Minimize the given DFA 'a' by removing all unreachable states, removing all
  1854. * dead states and merging all non distinguishable states
  1855. *
  1856. * @param ctx context
  1857. * @param a DFA automaton
  1858. * @return GNUNET_OK on success
  1859. */
  1860. static int
  1861. dfa_minimize (struct REGEX_INTERNAL_Context *ctx,
  1862. struct REGEX_INTERNAL_Automaton *a)
  1863. {
  1864. if (NULL == a)
  1865. return GNUNET_SYSERR;
  1866. GNUNET_assert (DFA == a->type);
  1867. /* 1. remove unreachable states */
  1868. dfa_remove_unreachable_states (a);
  1869. /* 2. remove dead states */
  1870. dfa_remove_dead_states (a);
  1871. /* 3. Merge nondistinguishable states */
  1872. if (GNUNET_OK != dfa_merge_nondistinguishable_states (ctx, a))
  1873. return GNUNET_SYSERR;
  1874. return GNUNET_OK;
  1875. }
  1876. /**
  1877. * Context for adding strided transitions to a DFA.
  1878. */
  1879. struct REGEX_INTERNAL_Strided_Context
  1880. {
  1881. /**
  1882. * Length of the strides.
  1883. */
  1884. const unsigned int stride;
  1885. /**
  1886. * Strided transitions DLL. New strided transitions will be stored in this DLL
  1887. * and afterwards added to the DFA.
  1888. */
  1889. struct REGEX_INTERNAL_Transition *transitions_head;
  1890. /**
  1891. * Strided transitions DLL.
  1892. */
  1893. struct REGEX_INTERNAL_Transition *transitions_tail;
  1894. };
  1895. /**
  1896. * Recursive helper function to add strides to a DFA.
  1897. *
  1898. * @param cls context, contains stride length and strided transitions DLL.
  1899. * @param depth current depth of the depth-first traversal of the graph.
  1900. * @param label current label, string that contains all labels on the path from
  1901. * 'start' to 's'.
  1902. * @param start start state for the depth-first traversal of the graph.
  1903. * @param s current state in the depth-first traversal
  1904. */
  1905. static void
  1906. dfa_add_multi_strides_helper (void *cls, const unsigned int depth, char *label,
  1907. struct REGEX_INTERNAL_State *start,
  1908. struct REGEX_INTERNAL_State *s)
  1909. {
  1910. struct REGEX_INTERNAL_Strided_Context *ctx = cls;
  1911. struct REGEX_INTERNAL_Transition *t;
  1912. char *new_label;
  1913. if (depth == ctx->stride)
  1914. {
  1915. t = GNUNET_new (struct REGEX_INTERNAL_Transition);
  1916. t->label = GNUNET_strdup (label);
  1917. t->to_state = s;
  1918. t->from_state = start;
  1919. GNUNET_CONTAINER_DLL_insert (ctx->transitions_head, ctx->transitions_tail,
  1920. t);
  1921. }
  1922. else
  1923. {
  1924. for (t = s->transitions_head; NULL != t; t = t->next)
  1925. {
  1926. /* Do not consider self-loops, because it end's up in too many
  1927. * transitions */
  1928. if (t->to_state == t->from_state)
  1929. continue;
  1930. if (NULL != label)
  1931. {
  1932. GNUNET_asprintf (&new_label, "%s%s", label, t->label);
  1933. }
  1934. else
  1935. new_label = GNUNET_strdup (t->label);
  1936. dfa_add_multi_strides_helper (cls, (depth + 1), new_label, start,
  1937. t->to_state);
  1938. }
  1939. }
  1940. GNUNET_free_non_null (label);
  1941. }
  1942. /**
  1943. * Function called for each state in the DFA. Starts a traversal of depth set in
  1944. * context starting from state 's'.
  1945. *
  1946. * @param cls context.
  1947. * @param count not used.
  1948. * @param s current state.
  1949. */
  1950. static void
  1951. dfa_add_multi_strides (void *cls, const unsigned int count,
  1952. struct REGEX_INTERNAL_State *s)
  1953. {
  1954. dfa_add_multi_strides_helper (cls, 0, NULL, s, s);
  1955. }
  1956. /**
  1957. * Adds multi-strided transitions to the given 'dfa'.
  1958. *
  1959. * @param regex_ctx regex context needed to add transitions to the automaton.
  1960. * @param dfa DFA to which the multi strided transitions should be added.
  1961. * @param stride_len length of the strides.
  1962. */
  1963. void
  1964. REGEX_INTERNAL_dfa_add_multi_strides (struct REGEX_INTERNAL_Context *regex_ctx,
  1965. struct REGEX_INTERNAL_Automaton *dfa,
  1966. const unsigned int stride_len)
  1967. {
  1968. struct REGEX_INTERNAL_Strided_Context ctx = { stride_len, NULL, NULL };
  1969. struct REGEX_INTERNAL_Transition *t;
  1970. struct REGEX_INTERNAL_Transition *t_next;
  1971. if (1 > stride_len || GNUNET_YES == dfa->is_multistrided)
  1972. return;
  1973. /* Compute the new transitions of given stride_len */
  1974. REGEX_INTERNAL_automaton_traverse (dfa, dfa->start, NULL, NULL,
  1975. &dfa_add_multi_strides, &ctx);
  1976. /* Add all the new transitions to the automaton. */
  1977. for (t = ctx.transitions_head; NULL != t; t = t_next)
  1978. {
  1979. t_next = t->next;
  1980. state_add_transition (regex_ctx, t->from_state, t->label, t->to_state);
  1981. GNUNET_CONTAINER_DLL_remove (ctx.transitions_head, ctx.transitions_tail, t);
  1982. GNUNET_free_non_null (t->label);
  1983. GNUNET_free (t);
  1984. }
  1985. /* Mark this automaton as multistrided */
  1986. dfa->is_multistrided = GNUNET_YES;
  1987. }
  1988. /**
  1989. * Recursive Helper function for DFA path compression. Does DFS on the DFA graph
  1990. * and adds new transitions to the given transitions DLL and marks states that
  1991. * should be removed by setting state->contained to GNUNET_YES.
  1992. *
  1993. * @param dfa DFA for which the paths should be compressed.
  1994. * @param start starting state for linear path search.
  1995. * @param cur current state in the recursive DFS.
  1996. * @param label current label (string of traversed labels).
  1997. * @param max_len maximal path compression length.
  1998. * @param transitions_head transitions DLL.
  1999. * @param transitions_tail transitions DLL.
  2000. */
  2001. void
  2002. dfa_compress_paths_helper (struct REGEX_INTERNAL_Automaton *dfa,
  2003. struct REGEX_INTERNAL_State *start,
  2004. struct REGEX_INTERNAL_State *cur, char *label,
  2005. unsigned int max_len,
  2006. struct REGEX_INTERNAL_Transition **transitions_head,
  2007. struct REGEX_INTERNAL_Transition **transitions_tail)
  2008. {
  2009. struct REGEX_INTERNAL_Transition *t;
  2010. char *new_label;
  2011. if (NULL != label &&
  2012. ((cur->incoming_transition_count > 1 || GNUNET_YES == cur->accepting ||
  2013. GNUNET_YES == cur->marked) || (start != dfa->start && max_len > 0 &&
  2014. max_len == strlen (label)) ||
  2015. (start == dfa->start && GNUNET_REGEX_INITIAL_BYTES == strlen (label))))
  2016. {
  2017. t = GNUNET_new (struct REGEX_INTERNAL_Transition);
  2018. t->label = GNUNET_strdup (label);
  2019. t->to_state = cur;
  2020. t->from_state = start;
  2021. GNUNET_CONTAINER_DLL_insert (*transitions_head, *transitions_tail, t);
  2022. if (GNUNET_NO == cur->marked)
  2023. {
  2024. dfa_compress_paths_helper (dfa, cur, cur, NULL, max_len, transitions_head,
  2025. transitions_tail);
  2026. }
  2027. return;
  2028. }
  2029. else if (cur != start)
  2030. cur->contained = GNUNET_YES;
  2031. if (GNUNET_YES == cur->marked && cur != start)
  2032. return;
  2033. cur->marked = GNUNET_YES;
  2034. for (t = cur->transitions_head; NULL != t; t = t->next)
  2035. {
  2036. if (NULL != label)
  2037. GNUNET_asprintf (&new_label, "%s%s", label, t->label);
  2038. else
  2039. new_label = GNUNET_strdup (t->label);
  2040. if (t->to_state != cur)
  2041. {
  2042. dfa_compress_paths_helper (dfa, start, t->to_state, new_label, max_len,
  2043. transitions_head, transitions_tail);
  2044. }
  2045. GNUNET_free (new_label);
  2046. }
  2047. }
  2048. /**
  2049. * Compress paths in the given 'dfa'. Linear paths like 0->1->2->3 will be
  2050. * compressed to 0->3 by combining transitions.
  2051. *
  2052. * @param regex_ctx context for adding new transitions.
  2053. * @param dfa DFA representation, will directly modify the given DFA.
  2054. * @param max_len maximal length of the compressed paths.
  2055. */
  2056. static void
  2057. dfa_compress_paths (struct REGEX_INTERNAL_Context *regex_ctx,
  2058. struct REGEX_INTERNAL_Automaton *dfa, unsigned int max_len)
  2059. {
  2060. struct REGEX_INTERNAL_State *s;
  2061. struct REGEX_INTERNAL_State *s_next;
  2062. struct REGEX_INTERNAL_Transition *t;
  2063. struct REGEX_INTERNAL_Transition *t_next;
  2064. struct REGEX_INTERNAL_Transition *transitions_head = NULL;
  2065. struct REGEX_INTERNAL_Transition *transitions_tail = NULL;
  2066. if (NULL == dfa)
  2067. return;
  2068. /* Count the incoming transitions on each state. */
  2069. for (s = dfa->states_head; NULL != s; s = s->next)
  2070. {
  2071. for (t = s->transitions_head; NULL != t; t = t->next)
  2072. {
  2073. if (NULL != t->to_state)
  2074. t->to_state->incoming_transition_count++;
  2075. }
  2076. }
  2077. /* Unmark all states. */
  2078. for (s = dfa->states_head; NULL != s; s = s->next)
  2079. {
  2080. s->marked = GNUNET_NO;
  2081. s->contained = GNUNET_NO;
  2082. }
  2083. /* Add strides and mark states that can be deleted. */
  2084. dfa_compress_paths_helper (dfa, dfa->start, dfa->start, NULL, max_len,
  2085. &transitions_head, &transitions_tail);
  2086. /* Add all the new transitions to the automaton. */
  2087. for (t = transitions_head; NULL != t; t = t_next)
  2088. {
  2089. t_next = t->next;
  2090. state_add_transition (regex_ctx, t->from_state, t->label, t->to_state);
  2091. GNUNET_CONTAINER_DLL_remove (transitions_head, transitions_tail, t);
  2092. GNUNET_free_non_null (t->label);
  2093. GNUNET_free (t);
  2094. }
  2095. /* Remove marked states (including their incoming and outgoing transitions). */
  2096. for (s = dfa->states_head; NULL != s; s = s_next)
  2097. {
  2098. s_next = s->next;
  2099. if (GNUNET_YES == s->contained)
  2100. automaton_remove_state (dfa, s);
  2101. }
  2102. }
  2103. /**
  2104. * Creates a new NFA fragment. Needs to be cleared using
  2105. * automaton_fragment_clear.
  2106. *
  2107. * @param start starting state
  2108. * @param end end state
  2109. *
  2110. * @return new NFA fragment
  2111. */
  2112. static struct REGEX_INTERNAL_Automaton *
  2113. nfa_fragment_create (struct REGEX_INTERNAL_State *start,
  2114. struct REGEX_INTERNAL_State *end)
  2115. {
  2116. struct REGEX_INTERNAL_Automaton *n;
  2117. n = GNUNET_new (struct REGEX_INTERNAL_Automaton);
  2118. n->type = NFA;
  2119. n->start = NULL;
  2120. n->end = NULL;
  2121. n->state_count = 0;
  2122. if (NULL == start || NULL == end)
  2123. return n;
  2124. automaton_add_state (n, end);
  2125. automaton_add_state (n, start);
  2126. n->state_count = 2;
  2127. n->start = start;
  2128. n->end = end;
  2129. return n;
  2130. }
  2131. /**
  2132. * Adds a list of states to the given automaton 'n'.
  2133. *
  2134. * @param n automaton to which the states should be added
  2135. * @param states_head head of the DLL of states
  2136. * @param states_tail tail of the DLL of states
  2137. */
  2138. static void
  2139. nfa_add_states (struct REGEX_INTERNAL_Automaton *n,
  2140. struct REGEX_INTERNAL_State *states_head,
  2141. struct REGEX_INTERNAL_State *states_tail)
  2142. {
  2143. struct REGEX_INTERNAL_State *s;
  2144. if (NULL == n || NULL == states_head)
  2145. {
  2146. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not add states\n");
  2147. return;
  2148. }
  2149. if (NULL == n->states_head)
  2150. {
  2151. n->states_head = states_head;
  2152. n->states_tail = states_tail;
  2153. return;
  2154. }
  2155. if (NULL != states_head)
  2156. {
  2157. n->states_tail->next = states_head;
  2158. n->states_tail = states_tail;
  2159. }
  2160. for (s = states_head; NULL != s; s = s->next)
  2161. n->state_count++;
  2162. }
  2163. /**
  2164. * Creates a new NFA state. Needs to be freed using automaton_destroy_state.
  2165. *
  2166. * @param ctx context
  2167. * @param accepting is it an accepting state or not
  2168. *
  2169. * @return new NFA state
  2170. */
  2171. static struct REGEX_INTERNAL_State *
  2172. nfa_state_create (struct REGEX_INTERNAL_Context *ctx, int accepting)
  2173. {
  2174. struct REGEX_INTERNAL_State *s;
  2175. s = GNUNET_new (struct REGEX_INTERNAL_State);
  2176. s->id = ctx->state_id++;
  2177. s->accepting = accepting;
  2178. s->marked = GNUNET_NO;
  2179. s->contained = 0;
  2180. s->index = -1;
  2181. s->lowlink = -1;
  2182. s->scc_id = 0;
  2183. s->name = NULL;
  2184. GNUNET_asprintf (&s->name, "s%i", s->id);
  2185. return s;
  2186. }
  2187. /**
  2188. * Calculates the closure set for the given set of states.
  2189. *
  2190. * @param ret set to sorted nfa closure on 'label' (epsilon closure if 'label' is NULL)
  2191. * @param nfa the NFA containing 's'
  2192. * @param states list of states on which to base the closure on
  2193. * @param label transitioning label for which to base the closure on,
  2194. * pass NULL for epsilon transition
  2195. */
  2196. static void
  2197. nfa_closure_set_create (struct REGEX_INTERNAL_StateSet *ret,
  2198. struct REGEX_INTERNAL_Automaton *nfa,
  2199. struct REGEX_INTERNAL_StateSet *states, const char *label)
  2200. {
  2201. struct REGEX_INTERNAL_State *s;
  2202. unsigned int i;
  2203. struct REGEX_INTERNAL_StateSet_MDLL cls_stack;
  2204. struct REGEX_INTERNAL_State *clsstate;
  2205. struct REGEX_INTERNAL_State *currentstate;
  2206. struct REGEX_INTERNAL_Transition *ctran;
  2207. memset (ret, 0, sizeof (struct REGEX_INTERNAL_StateSet));
  2208. if (NULL == states)
  2209. return;
  2210. for (i = 0; i < states->off; i++)
  2211. {
  2212. s = states->states[i];
  2213. /* Add start state to closure only for epsilon closure */
  2214. if (NULL == label)
  2215. state_set_append (ret, s);
  2216. /* initialize work stack */
  2217. cls_stack.head = NULL;
  2218. cls_stack.tail = NULL;
  2219. GNUNET_CONTAINER_MDLL_insert (ST, cls_stack.head, cls_stack.tail, s);
  2220. cls_stack.len = 1;
  2221. while (NULL != (currentstate = cls_stack.tail))
  2222. {
  2223. GNUNET_CONTAINER_MDLL_remove (ST, cls_stack.head, cls_stack.tail,
  2224. currentstate);
  2225. cls_stack.len--;
  2226. for (ctran = currentstate->transitions_head; NULL != ctran;
  2227. ctran = ctran->next)
  2228. {
  2229. if (NULL == (clsstate = ctran->to_state))
  2230. continue;
  2231. if (0 != clsstate->contained)
  2232. continue;
  2233. if (0 != nullstrcmp (label, ctran->label))
  2234. continue;
  2235. state_set_append (ret, clsstate);
  2236. GNUNET_CONTAINER_MDLL_insert_tail (ST, cls_stack.head, cls_stack.tail,
  2237. clsstate);
  2238. cls_stack.len++;
  2239. clsstate->contained = 1;
  2240. }
  2241. }
  2242. }
  2243. for (i = 0; i < ret->off; i++)
  2244. ret->states[i]->contained = 0;
  2245. if (ret->off > 1)
  2246. qsort (ret->states, ret->off, sizeof (struct REGEX_INTERNAL_State *),
  2247. &state_compare);
  2248. }
  2249. /**
  2250. * Pops two NFA fragments (a, b) from the stack and concatenates them (ab)
  2251. *
  2252. * @param ctx context
  2253. */
  2254. static void
  2255. nfa_add_concatenation (struct REGEX_INTERNAL_Context *ctx)
  2256. {
  2257. struct REGEX_INTERNAL_Automaton *a;
  2258. struct REGEX_INTERNAL_Automaton *b;
  2259. struct REGEX_INTERNAL_Automaton *new_nfa;
  2260. b = ctx->stack_tail;
  2261. GNUNET_assert (NULL != b);
  2262. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, b);
  2263. a = ctx->stack_tail;
  2264. GNUNET_assert (NULL != a);
  2265. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2266. state_add_transition (ctx, a->end, NULL, b->start);
  2267. a->end->accepting = 0;
  2268. b->end->accepting = 1;
  2269. new_nfa = nfa_fragment_create (NULL, NULL);
  2270. nfa_add_states (new_nfa, a->states_head, a->states_tail);
  2271. nfa_add_states (new_nfa, b->states_head, b->states_tail);
  2272. new_nfa->start = a->start;
  2273. new_nfa->end = b->end;
  2274. new_nfa->state_count += a->state_count + b->state_count;
  2275. automaton_fragment_clear (a);
  2276. automaton_fragment_clear (b);
  2277. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
  2278. }
  2279. /**
  2280. * Pops a NFA fragment from the stack (a) and adds a new fragment (a*)
  2281. *
  2282. * @param ctx context
  2283. */
  2284. static void
  2285. nfa_add_star_op (struct REGEX_INTERNAL_Context *ctx)
  2286. {
  2287. struct REGEX_INTERNAL_Automaton *a;
  2288. struct REGEX_INTERNAL_Automaton *new_nfa;
  2289. struct REGEX_INTERNAL_State *start;
  2290. struct REGEX_INTERNAL_State *end;
  2291. a = ctx->stack_tail;
  2292. if (NULL == a)
  2293. {
  2294. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2295. "nfa_add_star_op failed, because there was no element on the stack");
  2296. return;
  2297. }
  2298. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2299. start = nfa_state_create (ctx, 0);
  2300. end = nfa_state_create (ctx, 1);
  2301. state_add_transition (ctx, start, NULL, a->start);
  2302. state_add_transition (ctx, start, NULL, end);
  2303. state_add_transition (ctx, a->end, NULL, a->start);
  2304. state_add_transition (ctx, a->end, NULL, end);
  2305. a->end->accepting = 0;
  2306. end->accepting = 1;
  2307. new_nfa = nfa_fragment_create (start, end);
  2308. nfa_add_states (new_nfa, a->states_head, a->states_tail);
  2309. automaton_fragment_clear (a);
  2310. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
  2311. }
  2312. /**
  2313. * Pops an NFA fragment (a) from the stack and adds a new fragment (a+)
  2314. *
  2315. * @param ctx context
  2316. */
  2317. static void
  2318. nfa_add_plus_op (struct REGEX_INTERNAL_Context *ctx)
  2319. {
  2320. struct REGEX_INTERNAL_Automaton *a;
  2321. a = ctx->stack_tail;
  2322. if (NULL == a)
  2323. {
  2324. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2325. "nfa_add_plus_op failed, because there was no element on the stack");
  2326. return;
  2327. }
  2328. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2329. state_add_transition (ctx, a->end, NULL, a->start);
  2330. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, a);
  2331. }
  2332. /**
  2333. * Pops an NFA fragment (a) from the stack and adds a new fragment (a?)
  2334. *
  2335. * @param ctx context
  2336. */
  2337. static void
  2338. nfa_add_question_op (struct REGEX_INTERNAL_Context *ctx)
  2339. {
  2340. struct REGEX_INTERNAL_Automaton *a;
  2341. struct REGEX_INTERNAL_Automaton *new_nfa;
  2342. struct REGEX_INTERNAL_State *start;
  2343. struct REGEX_INTERNAL_State *end;
  2344. a = ctx->stack_tail;
  2345. if (NULL == a)
  2346. {
  2347. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2348. "nfa_add_question_op failed, because there was no element on the stack");
  2349. return;
  2350. }
  2351. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2352. start = nfa_state_create (ctx, 0);
  2353. end = nfa_state_create (ctx, 1);
  2354. state_add_transition (ctx, start, NULL, a->start);
  2355. state_add_transition (ctx, start, NULL, end);
  2356. state_add_transition (ctx, a->end, NULL, end);
  2357. a->end->accepting = 0;
  2358. new_nfa = nfa_fragment_create (start, end);
  2359. nfa_add_states (new_nfa, a->states_head, a->states_tail);
  2360. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
  2361. automaton_fragment_clear (a);
  2362. }
  2363. /**
  2364. * Pops two NFA fragments (a, b) from the stack and adds a new NFA fragment that
  2365. * alternates between a and b (a|b)
  2366. *
  2367. * @param ctx context
  2368. */
  2369. static void
  2370. nfa_add_alternation (struct REGEX_INTERNAL_Context *ctx)
  2371. {
  2372. struct REGEX_INTERNAL_Automaton *a;
  2373. struct REGEX_INTERNAL_Automaton *b;
  2374. struct REGEX_INTERNAL_Automaton *new_nfa;
  2375. struct REGEX_INTERNAL_State *start;
  2376. struct REGEX_INTERNAL_State *end;
  2377. b = ctx->stack_tail;
  2378. GNUNET_assert (NULL != b);
  2379. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, b);
  2380. a = ctx->stack_tail;
  2381. GNUNET_assert (NULL != a);
  2382. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2383. start = nfa_state_create (ctx, 0);
  2384. end = nfa_state_create (ctx, 1);
  2385. state_add_transition (ctx, start, NULL, a->start);
  2386. state_add_transition (ctx, start, NULL, b->start);
  2387. state_add_transition (ctx, a->end, NULL, end);
  2388. state_add_transition (ctx, b->end, NULL, end);
  2389. a->end->accepting = 0;
  2390. b->end->accepting = 0;
  2391. end->accepting = 1;
  2392. new_nfa = nfa_fragment_create (start, end);
  2393. nfa_add_states (new_nfa, a->states_head, a->states_tail);
  2394. nfa_add_states (new_nfa, b->states_head, b->states_tail);
  2395. automaton_fragment_clear (a);
  2396. automaton_fragment_clear (b);
  2397. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
  2398. }
  2399. /**
  2400. * Adds a new nfa fragment to the stack
  2401. *
  2402. * @param ctx context
  2403. * @param label label for nfa transition
  2404. */
  2405. static void
  2406. nfa_add_label (struct REGEX_INTERNAL_Context *ctx, const char *label)
  2407. {
  2408. struct REGEX_INTERNAL_Automaton *n;
  2409. struct REGEX_INTERNAL_State *start;
  2410. struct REGEX_INTERNAL_State *end;
  2411. GNUNET_assert (NULL != ctx);
  2412. start = nfa_state_create (ctx, 0);
  2413. end = nfa_state_create (ctx, 1);
  2414. state_add_transition (ctx, start, label, end);
  2415. n = nfa_fragment_create (start, end);
  2416. GNUNET_assert (NULL != n);
  2417. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, n);
  2418. }
  2419. /**
  2420. * Initialize a new context
  2421. *
  2422. * @param ctx context
  2423. */
  2424. static void
  2425. REGEX_INTERNAL_context_init (struct REGEX_INTERNAL_Context *ctx)
  2426. {
  2427. if (NULL == ctx)
  2428. {
  2429. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Context was NULL!");
  2430. return;
  2431. }
  2432. ctx->state_id = 0;
  2433. ctx->transition_id = 0;
  2434. ctx->stack_head = NULL;
  2435. ctx->stack_tail = NULL;
  2436. }
  2437. /**
  2438. * Construct an NFA by parsing the regex string of length 'len'.
  2439. *
  2440. * @param regex regular expression string
  2441. * @param len length of the string
  2442. *
  2443. * @return NFA, needs to be freed using REGEX_INTERNAL_destroy_automaton
  2444. */
  2445. struct REGEX_INTERNAL_Automaton *
  2446. REGEX_INTERNAL_construct_nfa (const char *regex, const size_t len)
  2447. {
  2448. struct REGEX_INTERNAL_Context ctx;
  2449. struct REGEX_INTERNAL_Automaton *nfa;
  2450. const char *regexp;
  2451. char curlabel[2];
  2452. char *error_msg;
  2453. unsigned int count;
  2454. unsigned int altcount;
  2455. unsigned int atomcount;
  2456. unsigned int poff;
  2457. unsigned int psize;
  2458. struct
  2459. {
  2460. int altcount;
  2461. int atomcount;
  2462. } *p;
  2463. if (NULL == regex || 0 == strlen (regex) || 0 == len)
  2464. {
  2465. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2466. "Could not parse regex. Empty regex string provided.\n");
  2467. return NULL;
  2468. }
  2469. REGEX_INTERNAL_context_init (&ctx);
  2470. regexp = regex;
  2471. curlabel[1] = '\0';
  2472. p = NULL;
  2473. error_msg = NULL;
  2474. altcount = 0;
  2475. atomcount = 0;
  2476. poff = 0;
  2477. psize = 0;
  2478. for (count = 0; count < len && *regexp; count++, regexp++)
  2479. {
  2480. switch (*regexp)
  2481. {
  2482. case '(':
  2483. if (atomcount > 1)
  2484. {
  2485. --atomcount;
  2486. nfa_add_concatenation (&ctx);
  2487. }
  2488. if (poff == psize)
  2489. GNUNET_array_grow (p, psize, psize * 2 + 4); /* FIXME why *2 +4? */
  2490. p[poff].altcount = altcount;
  2491. p[poff].atomcount = atomcount;
  2492. poff++;
  2493. altcount = 0;
  2494. atomcount = 0;
  2495. break;
  2496. case '|':
  2497. if (0 == atomcount)
  2498. {
  2499. error_msg = "Cannot append '|' to nothing";
  2500. goto error;
  2501. }
  2502. while (--atomcount > 0)
  2503. nfa_add_concatenation (&ctx);
  2504. altcount++;
  2505. break;
  2506. case ')':
  2507. if (0 == poff)
  2508. {
  2509. error_msg = "Missing opening '('";
  2510. goto error;
  2511. }
  2512. if (0 == atomcount)
  2513. {
  2514. /* Ignore this: "()" */
  2515. poff--;
  2516. altcount = p[poff].altcount;
  2517. atomcount = p[poff].atomcount;
  2518. break;
  2519. }
  2520. while (--atomcount > 0)
  2521. nfa_add_concatenation (&ctx);
  2522. for (; altcount > 0; altcount--)
  2523. nfa_add_alternation (&ctx);
  2524. poff--;
  2525. altcount = p[poff].altcount;
  2526. atomcount = p[poff].atomcount;
  2527. atomcount++;
  2528. break;
  2529. case '*':
  2530. if (atomcount == 0)
  2531. {
  2532. error_msg = "Cannot append '*' to nothing";
  2533. goto error;
  2534. }
  2535. nfa_add_star_op (&ctx);
  2536. break;
  2537. case '+':
  2538. if (atomcount == 0)
  2539. {
  2540. error_msg = "Cannot append '+' to nothing";
  2541. goto error;
  2542. }
  2543. nfa_add_plus_op (&ctx);
  2544. break;
  2545. case '?':
  2546. if (atomcount == 0)
  2547. {
  2548. error_msg = "Cannot append '?' to nothing";
  2549. goto error;
  2550. }
  2551. nfa_add_question_op (&ctx);
  2552. break;
  2553. default:
  2554. if (atomcount > 1)
  2555. {
  2556. --atomcount;
  2557. nfa_add_concatenation (&ctx);
  2558. }
  2559. curlabel[0] = *regexp;
  2560. nfa_add_label (&ctx, curlabel);
  2561. atomcount++;
  2562. break;
  2563. }
  2564. }
  2565. if (0 != poff)
  2566. {
  2567. error_msg = "Unbalanced parenthesis";
  2568. goto error;
  2569. }
  2570. while (--atomcount > 0)
  2571. nfa_add_concatenation (&ctx);
  2572. for (; altcount > 0; altcount--)
  2573. nfa_add_alternation (&ctx);
  2574. GNUNET_array_grow (p, psize, 0);
  2575. nfa = ctx.stack_tail;
  2576. GNUNET_CONTAINER_DLL_remove (ctx.stack_head, ctx.stack_tail, nfa);
  2577. if (NULL != ctx.stack_head)
  2578. {
  2579. error_msg = "Creating the NFA failed. NFA stack was not empty!";
  2580. goto error;
  2581. }
  2582. /* Remember the regex that was used to generate this NFA */
  2583. nfa->regex = GNUNET_strdup (regex);
  2584. /* create depth-first numbering of the states for pretty printing */
  2585. REGEX_INTERNAL_automaton_traverse (nfa, NULL, NULL, NULL, &number_states, NULL);
  2586. /* No multistriding added so far */
  2587. nfa->is_multistrided = GNUNET_NO;
  2588. return nfa;
  2589. error:
  2590. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not parse regex: `%s'\n", regex);
  2591. if (NULL != error_msg)
  2592. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "%s\n", error_msg);
  2593. GNUNET_free_non_null (p);
  2594. while (NULL != (nfa = ctx.stack_head))
  2595. {
  2596. GNUNET_CONTAINER_DLL_remove (ctx.stack_head, ctx.stack_tail, nfa);
  2597. REGEX_INTERNAL_automaton_destroy (nfa);
  2598. }
  2599. return NULL;
  2600. }
  2601. /**
  2602. * Create DFA states based on given 'nfa' and starting with 'dfa_state'.
  2603. *
  2604. * @param ctx context.
  2605. * @param nfa NFA automaton.
  2606. * @param dfa DFA automaton.
  2607. * @param dfa_state current dfa state, pass epsilon closure of first nfa state
  2608. * for starting.
  2609. */
  2610. static void
  2611. construct_dfa_states (struct REGEX_INTERNAL_Context *ctx,
  2612. struct REGEX_INTERNAL_Automaton *nfa,
  2613. struct REGEX_INTERNAL_Automaton *dfa,
  2614. struct REGEX_INTERNAL_State *dfa_state)
  2615. {
  2616. struct REGEX_INTERNAL_Transition *ctran;
  2617. struct REGEX_INTERNAL_State *new_dfa_state;
  2618. struct REGEX_INTERNAL_State *state_contains;
  2619. struct REGEX_INTERNAL_State *state_iter;
  2620. struct REGEX_INTERNAL_StateSet tmp;
  2621. struct REGEX_INTERNAL_StateSet nfa_set;
  2622. for (ctran = dfa_state->transitions_head; NULL != ctran; ctran = ctran->next)
  2623. {
  2624. if (NULL == ctran->label || NULL != ctran->to_state)
  2625. continue;
  2626. nfa_closure_set_create (&tmp, nfa, &dfa_state->nfa_set, ctran->label);
  2627. nfa_closure_set_create (&nfa_set, nfa, &tmp, NULL);
  2628. state_set_clear (&tmp);
  2629. state_contains = NULL;
  2630. for (state_iter = dfa->states_head; NULL != state_iter;
  2631. state_iter = state_iter->next)
  2632. {
  2633. if (0 == state_set_compare (&state_iter->nfa_set, &nfa_set))
  2634. {
  2635. state_contains = state_iter;
  2636. break;
  2637. }
  2638. }
  2639. if (NULL == state_contains)
  2640. {
  2641. new_dfa_state = dfa_state_create (ctx, &nfa_set);
  2642. automaton_add_state (dfa, new_dfa_state);
  2643. ctran->to_state = new_dfa_state;
  2644. construct_dfa_states (ctx, nfa, dfa, new_dfa_state);
  2645. }
  2646. else
  2647. {
  2648. ctran->to_state = state_contains;
  2649. state_set_clear (&nfa_set);
  2650. }
  2651. }
  2652. }
  2653. /**
  2654. * Construct DFA for the given 'regex' of length 'len'.
  2655. *
  2656. * Path compression means, that for example a DFA o -> a -> b -> c -> o will be
  2657. * compressed to o -> abc -> o. Note that this parameter influences the
  2658. * non-determinism of states of the resulting NFA in the DHT (number of outgoing
  2659. * edges with the same label). For example for an application that stores IPv4
  2660. * addresses as bitstrings it could make sense to limit the path compression to
  2661. * 4 or 8.
  2662. *
  2663. * @param regex regular expression string.
  2664. * @param len length of the regular expression.
  2665. * @param max_path_len limit the path compression length to the
  2666. * given value. If set to 1, no path compression is applied. Set to 0 for
  2667. * maximal possible path compression (generally not desireable).
  2668. * @return DFA, needs to be freed using REGEX_INTERNAL_automaton_destroy.
  2669. */
  2670. struct REGEX_INTERNAL_Automaton *
  2671. REGEX_INTERNAL_construct_dfa (const char *regex, const size_t len,
  2672. unsigned int max_path_len)
  2673. {
  2674. struct REGEX_INTERNAL_Context ctx;
  2675. struct REGEX_INTERNAL_Automaton *dfa;
  2676. struct REGEX_INTERNAL_Automaton *nfa;
  2677. struct REGEX_INTERNAL_StateSet nfa_start_eps_cls;
  2678. struct REGEX_INTERNAL_StateSet singleton_set;
  2679. REGEX_INTERNAL_context_init (&ctx);
  2680. /* Create NFA */
  2681. nfa = REGEX_INTERNAL_construct_nfa (regex, len);
  2682. if (NULL == nfa)
  2683. {
  2684. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2685. "Could not create DFA, because NFA creation failed\n");
  2686. return NULL;
  2687. }
  2688. dfa = GNUNET_new (struct REGEX_INTERNAL_Automaton);
  2689. dfa->type = DFA;
  2690. dfa->regex = GNUNET_strdup (regex);
  2691. /* Create DFA start state from epsilon closure */
  2692. memset (&singleton_set, 0, sizeof (struct REGEX_INTERNAL_StateSet));
  2693. state_set_append (&singleton_set, nfa->start);
  2694. nfa_closure_set_create (&nfa_start_eps_cls, nfa, &singleton_set, NULL);
  2695. state_set_clear (&singleton_set);
  2696. dfa->start = dfa_state_create (&ctx, &nfa_start_eps_cls);
  2697. automaton_add_state (dfa, dfa->start);
  2698. construct_dfa_states (&ctx, nfa, dfa, dfa->start);
  2699. REGEX_INTERNAL_automaton_destroy (nfa);
  2700. /* Minimize DFA */
  2701. if (GNUNET_OK != dfa_minimize (&ctx, dfa))
  2702. {
  2703. REGEX_INTERNAL_automaton_destroy (dfa);
  2704. return NULL;
  2705. }
  2706. /* Create proofs and hashes for all states */
  2707. if (GNUNET_OK != automaton_create_proofs (dfa))
  2708. {
  2709. REGEX_INTERNAL_automaton_destroy (dfa);
  2710. return NULL;
  2711. }
  2712. /* Compress linear DFA paths */
  2713. if (1 != max_path_len)
  2714. dfa_compress_paths (&ctx, dfa, max_path_len);
  2715. return dfa;
  2716. }
  2717. /**
  2718. * Free the memory allocated by constructing the REGEX_INTERNAL_Automaton data
  2719. * structure.
  2720. *
  2721. * @param a automaton to be destroyed
  2722. */
  2723. void
  2724. REGEX_INTERNAL_automaton_destroy (struct REGEX_INTERNAL_Automaton *a)
  2725. {
  2726. struct REGEX_INTERNAL_State *s;
  2727. struct REGEX_INTERNAL_State *next_state;
  2728. if (NULL == a)
  2729. return;
  2730. GNUNET_free_non_null (a->regex);
  2731. GNUNET_free_non_null (a->canonical_regex);
  2732. for (s = a->states_head; NULL != s; s = next_state)
  2733. {
  2734. next_state = s->next;
  2735. GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s);
  2736. automaton_destroy_state (s);
  2737. }
  2738. GNUNET_free (a);
  2739. }
  2740. /**
  2741. * Evaluates the given string using the given DFA automaton
  2742. *
  2743. * @param a automaton, type must be DFA
  2744. * @param string string that should be evaluated
  2745. *
  2746. * @return 0 if string matches, non 0 otherwise
  2747. */
  2748. static int
  2749. evaluate_dfa (struct REGEX_INTERNAL_Automaton *a, const char *string)
  2750. {
  2751. const char *strp;
  2752. struct REGEX_INTERNAL_State *s;
  2753. unsigned int step_len;
  2754. if (DFA != a->type)
  2755. {
  2756. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2757. "Tried to evaluate DFA, but NFA automaton given");
  2758. return -1;
  2759. }
  2760. s = a->start;
  2761. /* If the string is empty but the starting state is accepting, we accept. */
  2762. if ((NULL == string || 0 == strlen (string)) && s->accepting)
  2763. return 0;
  2764. for (strp = string; NULL != strp && *strp; strp += step_len)
  2765. {
  2766. step_len = dfa_move (&s, strp);
  2767. if (NULL == s)
  2768. break;
  2769. }
  2770. if (NULL != s && s->accepting)
  2771. return 0;
  2772. return 1;
  2773. }
  2774. /**
  2775. * Evaluates the given string using the given NFA automaton
  2776. *
  2777. * @param a automaton, type must be NFA
  2778. * @param string string that should be evaluated
  2779. *
  2780. * @return 0 if string matches, non 0 otherwise
  2781. */
  2782. static int
  2783. evaluate_nfa (struct REGEX_INTERNAL_Automaton *a, const char *string)
  2784. {
  2785. const char *strp;
  2786. char str[2];
  2787. struct REGEX_INTERNAL_State *s;
  2788. struct REGEX_INTERNAL_StateSet sset;
  2789. struct REGEX_INTERNAL_StateSet new_sset;
  2790. struct REGEX_INTERNAL_StateSet singleton_set;
  2791. unsigned int i;
  2792. int result;
  2793. if (NFA != a->type)
  2794. {
  2795. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2796. "Tried to evaluate NFA, but DFA automaton given");
  2797. return -1;
  2798. }
  2799. /* If the string is empty but the starting state is accepting, we accept. */
  2800. if ((NULL == string || 0 == strlen (string)) && a->start->accepting)
  2801. return 0;
  2802. result = 1;
  2803. memset (&singleton_set, 0, sizeof (struct REGEX_INTERNAL_StateSet));
  2804. state_set_append (&singleton_set, a->start);
  2805. nfa_closure_set_create (&sset, a, &singleton_set, NULL);
  2806. state_set_clear (&singleton_set);
  2807. str[1] = '\0';
  2808. for (strp = string; NULL != strp && *strp; strp++)
  2809. {
  2810. str[0] = *strp;
  2811. nfa_closure_set_create (&new_sset, a, &sset, str);
  2812. state_set_clear (&sset);
  2813. nfa_closure_set_create (&sset, a, &new_sset, 0);
  2814. state_set_clear (&new_sset);
  2815. }
  2816. for (i = 0; i < sset.off; i++)
  2817. {
  2818. s = sset.states[i];
  2819. if ( (NULL != s) && (s->accepting) )
  2820. {
  2821. result = 0;
  2822. break;
  2823. }
  2824. }
  2825. state_set_clear (&sset);
  2826. return result;
  2827. }
  2828. /**
  2829. * Evaluates the given 'string' against the given compiled regex
  2830. *
  2831. * @param a automaton
  2832. * @param string string to check
  2833. *
  2834. * @return 0 if string matches, non 0 otherwise
  2835. */
  2836. int
  2837. REGEX_INTERNAL_eval (struct REGEX_INTERNAL_Automaton *a, const char *string)
  2838. {
  2839. int result;
  2840. switch (a->type)
  2841. {
  2842. case DFA:
  2843. result = evaluate_dfa (a, string);
  2844. break;
  2845. case NFA:
  2846. result = evaluate_nfa (a, string);
  2847. break;
  2848. default:
  2849. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2850. "Evaluating regex failed, automaton has no type!\n");
  2851. result = GNUNET_SYSERR;
  2852. break;
  2853. }
  2854. return result;
  2855. }
  2856. /**
  2857. * Get the canonical regex of the given automaton.
  2858. * When constructing the automaton a proof is computed for each state,
  2859. * consisting of the regular expression leading to this state. A complete
  2860. * regex for the automaton can be computed by combining these proofs.
  2861. * As of now this function is only useful for testing.
  2862. *
  2863. * @param a automaton for which the canonical regex should be returned.
  2864. *
  2865. * @return
  2866. */
  2867. const char *
  2868. REGEX_INTERNAL_get_canonical_regex (struct REGEX_INTERNAL_Automaton *a)
  2869. {
  2870. if (NULL == a)
  2871. return NULL;
  2872. return a->canonical_regex;
  2873. }
  2874. /**
  2875. * Get the number of transitions that are contained in the given automaton.
  2876. *
  2877. * @param a automaton for which the number of transitions should be returned.
  2878. *
  2879. * @return number of transitions in the given automaton.
  2880. */
  2881. unsigned int
  2882. REGEX_INTERNAL_get_transition_count (struct REGEX_INTERNAL_Automaton *a)
  2883. {
  2884. unsigned int t_count;
  2885. struct REGEX_INTERNAL_State *s;
  2886. if (NULL == a)
  2887. return 0;
  2888. t_count = 0;
  2889. for (s = a->states_head; NULL != s; s = s->next)
  2890. t_count += s->transition_count;
  2891. return t_count;
  2892. }
  2893. /**
  2894. * Get the first key for the given 'input_string'. This hashes the first x bits
  2895. * of the 'input_string'.
  2896. *
  2897. * @param input_string string.
  2898. * @param string_len length of the 'input_string'.
  2899. * @param key pointer to where to write the hash code.
  2900. *
  2901. * @return number of bits of 'input_string' that have been consumed
  2902. * to construct the key
  2903. */
  2904. size_t
  2905. REGEX_INTERNAL_get_first_key (const char *input_string, size_t string_len,
  2906. struct GNUNET_HashCode * key)
  2907. {
  2908. size_t size;
  2909. size = string_len < GNUNET_REGEX_INITIAL_BYTES ? string_len :
  2910. GNUNET_REGEX_INITIAL_BYTES;
  2911. if (NULL == input_string)
  2912. {
  2913. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2914. "Given input string was NULL!\n");
  2915. return 0;
  2916. }
  2917. GNUNET_CRYPTO_hash (input_string, size, key);
  2918. return size;
  2919. }
  2920. /**
  2921. * Recursive function that calls the iterator for each synthetic start state.
  2922. *
  2923. * @param min_len minimum length of the path in the graph.
  2924. * @param max_len maximum length of the path in the graph.
  2925. * @param consumed_string string consumed by traversing the graph till this state.
  2926. * @param state current state of the automaton.
  2927. * @param iterator iterator function called for each edge.
  2928. * @param iterator_cls closure for the iterator function.
  2929. */
  2930. static void
  2931. iterate_initial_edge (const unsigned int min_len, const unsigned int max_len,
  2932. char *consumed_string, struct REGEX_INTERNAL_State *state,
  2933. REGEX_INTERNAL_KeyIterator iterator, void *iterator_cls)
  2934. {
  2935. char *temp;
  2936. struct REGEX_INTERNAL_Transition *t;
  2937. unsigned int num_edges = state->transition_count;
  2938. struct REGEX_BLOCK_Edge edges[num_edges];
  2939. struct REGEX_BLOCK_Edge edge[1];
  2940. struct GNUNET_HashCode hash;
  2941. struct GNUNET_HashCode hash_new;
  2942. unsigned int cur_len;
  2943. if (NULL != consumed_string)
  2944. cur_len = strlen (consumed_string);
  2945. else
  2946. cur_len = 0;
  2947. if ((cur_len >= min_len || GNUNET_YES == state->accepting) && cur_len > 0 &&
  2948. NULL != consumed_string)
  2949. {
  2950. if (cur_len <= max_len)
  2951. {
  2952. if (state->proof != NULL && 0 != strcmp (consumed_string, state->proof))
  2953. {
  2954. (void) state_get_edges (state, edges);
  2955. GNUNET_CRYPTO_hash (consumed_string, strlen (consumed_string), &hash);
  2956. iterator (iterator_cls, &hash, consumed_string, state->accepting,
  2957. num_edges, edges);
  2958. }
  2959. if (GNUNET_YES == state->accepting && cur_len > 1 &&
  2960. state->transition_count < 1 && cur_len < max_len)
  2961. {
  2962. /* Special case for regex consisting of just a string that is shorter than
  2963. * max_len */
  2964. edge[0].label = &consumed_string[cur_len - 1];
  2965. edge[0].destination = state->hash;
  2966. temp = GNUNET_strdup (consumed_string);
  2967. temp[cur_len - 1] = '\0';
  2968. GNUNET_CRYPTO_hash (temp, cur_len - 1, &hash_new);
  2969. iterator (iterator_cls, &hash_new, temp, GNUNET_NO, 1, edge);
  2970. GNUNET_free (temp);
  2971. }
  2972. }
  2973. else /* cur_len > max_len */
  2974. {
  2975. /* Case where the concatenated labels are longer than max_len, then split. */
  2976. edge[0].label = &consumed_string[max_len];
  2977. edge[0].destination = state->hash;
  2978. temp = GNUNET_strdup (consumed_string);
  2979. temp[max_len] = '\0';
  2980. GNUNET_CRYPTO_hash (temp, max_len, &hash);
  2981. iterator (iterator_cls, &hash, temp, GNUNET_NO, 1, edge);
  2982. GNUNET_free (temp);
  2983. }
  2984. }
  2985. if (cur_len < max_len)
  2986. {
  2987. for (t = state->transitions_head; NULL != t; t = t->next)
  2988. {
  2989. if (NULL != consumed_string)
  2990. GNUNET_asprintf (&temp, "%s%s", consumed_string, t->label);
  2991. else
  2992. GNUNET_asprintf (&temp, "%s", t->label);
  2993. iterate_initial_edge (min_len, max_len, temp, t->to_state, iterator,
  2994. iterator_cls);
  2995. GNUNET_free (temp);
  2996. }
  2997. }
  2998. }
  2999. /**
  3000. * Iterate over all edges starting from start state of automaton 'a'. Calling
  3001. * iterator for each edge.
  3002. *
  3003. * @param a automaton.
  3004. * @param iterator iterator called for each edge.
  3005. * @param iterator_cls closure.
  3006. */
  3007. void
  3008. REGEX_INTERNAL_iterate_all_edges (struct REGEX_INTERNAL_Automaton *a,
  3009. REGEX_INTERNAL_KeyIterator iterator,
  3010. void *iterator_cls)
  3011. {
  3012. struct REGEX_INTERNAL_State *s;
  3013. for (s = a->states_head; NULL != s; s = s->next)
  3014. {
  3015. struct REGEX_BLOCK_Edge edges[s->transition_count];
  3016. unsigned int num_edges;
  3017. num_edges = state_get_edges (s, edges);
  3018. if ( ( (NULL != s->proof) &&
  3019. (0 < strlen (s->proof)) ) || s->accepting)
  3020. iterator (iterator_cls, &s->hash, s->proof,
  3021. s->accepting,
  3022. num_edges, edges);
  3023. s->marked = GNUNET_NO;
  3024. }
  3025. iterate_initial_edge (GNUNET_REGEX_INITIAL_BYTES,
  3026. GNUNET_REGEX_INITIAL_BYTES,
  3027. NULL, a->start,
  3028. iterator, iterator_cls);
  3029. }
  3030. /**
  3031. * Struct to hold all the relevant state information in the HashMap.
  3032. *
  3033. * Contains the same info as the Regex Iterator parametes except the key,
  3034. * which comes directly from the HashMap iterator.
  3035. */
  3036. struct temporal_state_store {
  3037. int reachable;
  3038. char *proof;
  3039. int accepting;
  3040. int num_edges;
  3041. struct REGEX_BLOCK_Edge *edges;
  3042. };
  3043. /**
  3044. * Store regex iterator and cls in one place to pass to the hashmap iterator.
  3045. */
  3046. struct client_iterator {
  3047. REGEX_INTERNAL_KeyIterator iterator;
  3048. void *iterator_cls;
  3049. };
  3050. /**
  3051. * Iterator over all edges of a dfa. Stores all of them in a HashMap
  3052. * for later reachability marking.
  3053. *
  3054. * @param cls Closure (HashMap)
  3055. * @param key hash for current state.
  3056. * @param proof proof for current state
  3057. * @param accepting GNUNET_YES if this is an accepting state, GNUNET_NO if not.
  3058. * @param num_edges number of edges leaving current state.
  3059. * @param edges edges leaving current state.
  3060. */
  3061. static void
  3062. store_all_states (void *cls,
  3063. const struct GNUNET_HashCode *key,
  3064. const char *proof,
  3065. int accepting,
  3066. unsigned int num_edges,
  3067. const struct REGEX_BLOCK_Edge *edges)
  3068. {
  3069. struct GNUNET_CONTAINER_MultiHashMap *hm = cls;
  3070. struct temporal_state_store *tmp;
  3071. size_t edges_size;
  3072. tmp = GNUNET_new (struct temporal_state_store);
  3073. tmp->reachable = GNUNET_NO;
  3074. tmp->proof = GNUNET_strdup (proof);
  3075. tmp->accepting = accepting;
  3076. tmp->num_edges = num_edges;
  3077. edges_size = sizeof (struct REGEX_BLOCK_Edge) * num_edges;
  3078. tmp->edges = GNUNET_malloc (edges_size);
  3079. memcpy(tmp->edges, edges, edges_size);
  3080. GNUNET_CONTAINER_multihashmap_put (hm, key, tmp,
  3081. GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_FAST);
  3082. }
  3083. /**
  3084. * Mark state as reachable and call recursively on all its edges.
  3085. *
  3086. * If already marked as reachable, do nothing.
  3087. *
  3088. * @param state State to mark as reachable.
  3089. * @param hm HashMap which stores all the states indexed by key.
  3090. */
  3091. static void
  3092. mark_as_reachable (struct temporal_state_store *state,
  3093. struct GNUNET_CONTAINER_MultiHashMap *hm)
  3094. {
  3095. struct temporal_state_store *child;
  3096. unsigned int i;
  3097. if (GNUNET_YES == state->reachable)
  3098. /* visited */
  3099. return;
  3100. state->reachable = GNUNET_YES;
  3101. for (i = 0; i < state->num_edges; i++)
  3102. {
  3103. child = GNUNET_CONTAINER_multihashmap_get (hm,
  3104. &state->edges[i].destination);
  3105. if (NULL == child)
  3106. {
  3107. GNUNET_break (0);
  3108. continue;
  3109. }
  3110. mark_as_reachable (child, hm);
  3111. }
  3112. }
  3113. /**
  3114. * Iterator over hash map entries to mark the ones that are reachable.
  3115. *
  3116. * @param cls closure
  3117. * @param key current key code
  3118. * @param value value in the hash map
  3119. * @return #GNUNET_YES if we should continue to iterate,
  3120. * #GNUNET_NO if not.
  3121. */
  3122. static int
  3123. reachability_iterator (void *cls,
  3124. const struct GNUNET_HashCode *key,
  3125. void *value)
  3126. {
  3127. struct GNUNET_CONTAINER_MultiHashMap *hm = cls;
  3128. struct temporal_state_store *state = value;
  3129. if (GNUNET_YES == state->reachable)
  3130. /* already visited and marked */
  3131. return GNUNET_YES;
  3132. if (GNUNET_REGEX_INITIAL_BYTES > strlen (state->proof) &&
  3133. GNUNET_NO == state->accepting)
  3134. /* not directly reachable */
  3135. return GNUNET_YES;
  3136. mark_as_reachable (state, hm);
  3137. return GNUNET_YES;
  3138. }
  3139. /**
  3140. * Iterator over hash map entries.
  3141. * Calling the callback on the ones marked as reachables.
  3142. *
  3143. * @param cls closure
  3144. * @param key current key code
  3145. * @param value value in the hash map
  3146. * @return #GNUNET_YES if we should continue to iterate,
  3147. * #GNUNET_NO if not.
  3148. */
  3149. static int
  3150. iterate_reachables (void *cls,
  3151. const struct GNUNET_HashCode *key,
  3152. void *value)
  3153. {
  3154. struct client_iterator *ci = cls;
  3155. struct temporal_state_store *state = value;
  3156. if (GNUNET_YES == state->reachable)
  3157. {
  3158. ci->iterator (ci->iterator_cls, key,
  3159. state->proof, state->accepting,
  3160. state->num_edges, state->edges);
  3161. }
  3162. GNUNET_free (state->edges);
  3163. GNUNET_free (state->proof);
  3164. GNUNET_free (state);
  3165. return GNUNET_YES;
  3166. }
  3167. /**
  3168. * Iterate over all edges of automaton 'a' that are reachable from a state with
  3169. * a proof of at least GNUNET_REGEX_INITIAL_BYTES characters.
  3170. *
  3171. * Call the iterator for each such edge.
  3172. *
  3173. * @param a automaton.
  3174. * @param iterator iterator called for each reachable edge.
  3175. * @param iterator_cls closure.
  3176. */
  3177. void
  3178. REGEX_INTERNAL_iterate_reachable_edges (struct REGEX_INTERNAL_Automaton *a,
  3179. REGEX_INTERNAL_KeyIterator iterator,
  3180. void *iterator_cls)
  3181. {
  3182. struct GNUNET_CONTAINER_MultiHashMap *hm;
  3183. struct client_iterator ci;
  3184. hm = GNUNET_CONTAINER_multihashmap_create (a->state_count * 2, GNUNET_NO);
  3185. ci.iterator = iterator;
  3186. ci.iterator_cls = iterator_cls;
  3187. REGEX_INTERNAL_iterate_all_edges (a, &store_all_states, hm);
  3188. GNUNET_CONTAINER_multihashmap_iterate (hm, &reachability_iterator, hm);
  3189. GNUNET_CONTAINER_multihashmap_iterate (hm, &iterate_reachables, &ci);
  3190. GNUNET_CONTAINER_multihashmap_destroy (hm);
  3191. }
  3192. /* end of regex_internal.c */