regex_internal.c 101 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718
  1. /*
  2. This file is part of GNUnet
  3. Copyright (C) 2012 GNUnet e.V.
  4. GNUnet is free software: you can redistribute it and/or modify it
  5. under the terms of the GNU Affero General Public License as published
  6. by the Free Software Foundation, either version 3 of the License,
  7. or (at your option) any later version.
  8. GNUnet is distributed in the hope that it will be useful, but
  9. WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Affero General Public License for more details.
  12. You should have received a copy of the GNU Affero General Public License
  13. along with this program. If not, see <http://www.gnu.org/licenses/>.
  14. SPDX-License-Identifier: AGPL3.0-or-later
  15. */
  16. /**
  17. * @file src/regex/regex_internal.c
  18. * @brief library to create Deterministic Finite Automatons (DFAs) from regular
  19. * expressions (regexes).
  20. * @author Maximilian Szengel
  21. */
  22. #include "platform.h"
  23. #include "gnunet_util_lib.h"
  24. #include "gnunet_regex_service.h"
  25. #include "regex_internal_lib.h"
  26. #include "regex_internal.h"
  27. /**
  28. * Set this to #GNUNET_YES to enable state naming. Used to debug NFA->DFA
  29. * creation. Disabled by default for better performance.
  30. */
  31. #define REGEX_DEBUG_DFA GNUNET_NO
  32. /**
  33. * Set of states using MDLL API.
  34. */
  35. struct REGEX_INTERNAL_StateSet_MDLL
  36. {
  37. /**
  38. * MDLL of states.
  39. */
  40. struct REGEX_INTERNAL_State *head;
  41. /**
  42. * MDLL of states.
  43. */
  44. struct REGEX_INTERNAL_State *tail;
  45. /**
  46. * Length of the MDLL.
  47. */
  48. unsigned int len;
  49. };
  50. /**
  51. * Append state to the given StateSet.
  52. *
  53. * @param set set to be modified
  54. * @param state state to be appended
  55. */
  56. static void
  57. state_set_append (struct REGEX_INTERNAL_StateSet *set,
  58. struct REGEX_INTERNAL_State *state)
  59. {
  60. if (set->off == set->size)
  61. GNUNET_array_grow (set->states, set->size, set->size * 2 + 4);
  62. set->states[set->off++] = state;
  63. }
  64. /**
  65. * Compare two strings for equality. If either is NULL they are not equal.
  66. *
  67. * @param str1 first string for comparison.
  68. * @param str2 second string for comparison.
  69. *
  70. * @return 0 if the strings are the same or both NULL, 1 or -1 if not.
  71. */
  72. static int
  73. nullstrcmp (const char *str1, const char *str2)
  74. {
  75. if ((NULL == str1) != (NULL == str2))
  76. return -1;
  77. if ((NULL == str1) && (NULL == str2))
  78. return 0;
  79. return strcmp (str1, str2);
  80. }
  81. /**
  82. * Adds a transition from one state to another on @a label. Does not add
  83. * duplicate states.
  84. *
  85. * @param ctx context
  86. * @param from_state starting state for the transition
  87. * @param label transition label
  88. * @param to_state state to where the transition should point to
  89. */
  90. static void
  91. state_add_transition (struct REGEX_INTERNAL_Context *ctx,
  92. struct REGEX_INTERNAL_State *from_state,
  93. const char *label,
  94. struct REGEX_INTERNAL_State *to_state)
  95. {
  96. struct REGEX_INTERNAL_Transition *t;
  97. struct REGEX_INTERNAL_Transition *oth;
  98. if (NULL == from_state)
  99. {
  100. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not create Transition.\n");
  101. return;
  102. }
  103. /* Do not add duplicate state transitions */
  104. for (t = from_state->transitions_head; NULL != t; t = t->next)
  105. {
  106. if ((t->to_state == to_state) && (0 == nullstrcmp (t->label, label)) &&
  107. (t->from_state == from_state) )
  108. return;
  109. }
  110. /* sort transitions by label */
  111. for (oth = from_state->transitions_head; NULL != oth; oth = oth->next)
  112. {
  113. if (0 < nullstrcmp (oth->label, label))
  114. break;
  115. }
  116. t = GNUNET_new (struct REGEX_INTERNAL_Transition);
  117. if (NULL != ctx)
  118. t->id = ctx->transition_id++;
  119. if (NULL != label)
  120. t->label = GNUNET_strdup (label);
  121. else
  122. t->label = NULL;
  123. t->to_state = to_state;
  124. t->from_state = from_state;
  125. /* Add outgoing transition to 'from_state' */
  126. from_state->transition_count++;
  127. GNUNET_CONTAINER_DLL_insert_before (from_state->transitions_head,
  128. from_state->transitions_tail,
  129. oth,
  130. t);
  131. }
  132. /**
  133. * Remove a 'transition' from 'state'.
  134. *
  135. * @param state state from which the to-be-removed transition originates.
  136. * @param transition transition that should be removed from state 'state'.
  137. */
  138. static void
  139. state_remove_transition (struct REGEX_INTERNAL_State *state,
  140. struct REGEX_INTERNAL_Transition *transition)
  141. {
  142. if ((NULL == state) || (NULL == transition))
  143. return;
  144. if (transition->from_state != state)
  145. return;
  146. GNUNET_free (transition->label);
  147. state->transition_count--;
  148. GNUNET_CONTAINER_DLL_remove (state->transitions_head,
  149. state->transitions_tail,
  150. transition);
  151. GNUNET_free (transition);
  152. }
  153. /**
  154. * Compare two states. Used for sorting.
  155. *
  156. * @param a first state
  157. * @param b second state
  158. *
  159. * @return an integer less than, equal to, or greater than zero
  160. * if the first argument is considered to be respectively
  161. * less than, equal to, or greater than the second.
  162. */
  163. static int
  164. state_compare (const void *a, const void *b)
  165. {
  166. struct REGEX_INTERNAL_State **s1 = (struct REGEX_INTERNAL_State **) a;
  167. struct REGEX_INTERNAL_State **s2 = (struct REGEX_INTERNAL_State **) b;
  168. return (*s1)->id - (*s2)->id;
  169. }
  170. /**
  171. * Get all edges leaving state @a s.
  172. *
  173. * @param s state.
  174. * @param edges all edges leaving @a s, expected to be allocated and have enough
  175. * space for `s->transitions_count` elements.
  176. *
  177. * @return number of edges.
  178. */
  179. static unsigned int
  180. state_get_edges (struct REGEX_INTERNAL_State *s, struct REGEX_BLOCK_Edge *edges)
  181. {
  182. struct REGEX_INTERNAL_Transition *t;
  183. unsigned int count;
  184. if (NULL == s)
  185. return 0;
  186. count = 0;
  187. for (t = s->transitions_head; NULL != t; t = t->next)
  188. {
  189. if (NULL != t->to_state)
  190. {
  191. edges[count].label = t->label;
  192. edges[count].destination = t->to_state->hash;
  193. count++;
  194. }
  195. }
  196. return count;
  197. }
  198. /**
  199. * Compare to state sets by comparing the id's of the states that are contained
  200. * in each set. Both sets are expected to be sorted by id!
  201. *
  202. * @param sset1 first state set
  203. * @param sset2 second state set
  204. * @return 0 if the sets are equal, otherwise non-zero
  205. */
  206. static int
  207. state_set_compare (struct REGEX_INTERNAL_StateSet *sset1,
  208. struct REGEX_INTERNAL_StateSet *sset2)
  209. {
  210. int result;
  211. unsigned int i;
  212. if ((NULL == sset1) || (NULL == sset2))
  213. return 1;
  214. result = sset1->off - sset2->off;
  215. if (result < 0)
  216. return -1;
  217. if (result > 0)
  218. return 1;
  219. for (i = 0; i < sset1->off; i++)
  220. if (0 != (result = state_compare (&sset1->states[i], &sset2->states[i])))
  221. break;
  222. return result;
  223. }
  224. /**
  225. * Clears the given StateSet 'set'
  226. *
  227. * @param set set to be cleared
  228. */
  229. static void
  230. state_set_clear (struct REGEX_INTERNAL_StateSet *set)
  231. {
  232. GNUNET_array_grow (set->states, set->size, 0);
  233. set->off = 0;
  234. }
  235. /**
  236. * Clears an automaton fragment. Does not destroy the states inside the
  237. * automaton.
  238. *
  239. * @param a automaton to be cleared
  240. */
  241. static void
  242. automaton_fragment_clear (struct REGEX_INTERNAL_Automaton *a)
  243. {
  244. if (NULL == a)
  245. return;
  246. a->start = NULL;
  247. a->end = NULL;
  248. a->states_head = NULL;
  249. a->states_tail = NULL;
  250. a->state_count = 0;
  251. GNUNET_free (a);
  252. }
  253. /**
  254. * Frees the memory used by State @a s
  255. *
  256. * @param s state that should be destroyed
  257. */
  258. static void
  259. automaton_destroy_state (struct REGEX_INTERNAL_State *s)
  260. {
  261. struct REGEX_INTERNAL_Transition *t;
  262. struct REGEX_INTERNAL_Transition *next_t;
  263. if (NULL == s)
  264. return;
  265. GNUNET_free (s->name);
  266. GNUNET_free (s->proof);
  267. state_set_clear (&s->nfa_set);
  268. for (t = s->transitions_head; NULL != t; t = next_t)
  269. {
  270. next_t = t->next;
  271. state_remove_transition (s, t);
  272. }
  273. GNUNET_free (s);
  274. }
  275. /**
  276. * Remove a state from the given automaton 'a'. Always use this function when
  277. * altering the states of an automaton. Will also remove all transitions leading
  278. * to this state, before destroying it.
  279. *
  280. * @param a automaton
  281. * @param s state to remove
  282. */
  283. static void
  284. automaton_remove_state (struct REGEX_INTERNAL_Automaton *a,
  285. struct REGEX_INTERNAL_State *s)
  286. {
  287. struct REGEX_INTERNAL_State *s_check;
  288. struct REGEX_INTERNAL_Transition *t_check;
  289. struct REGEX_INTERNAL_Transition *t_check_next;
  290. if ((NULL == a) || (NULL == s))
  291. return;
  292. /* remove all transitions leading to this state */
  293. for (s_check = a->states_head; NULL != s_check; s_check = s_check->next)
  294. {
  295. for (t_check = s_check->transitions_head; NULL != t_check;
  296. t_check = t_check_next)
  297. {
  298. t_check_next = t_check->next;
  299. if (t_check->to_state == s)
  300. state_remove_transition (s_check, t_check);
  301. }
  302. }
  303. /* remove state */
  304. GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s);
  305. a->state_count--;
  306. automaton_destroy_state (s);
  307. }
  308. /**
  309. * Merge two states into one. Will merge 's1' and 's2' into 's1' and destroy
  310. * 's2'. 's1' will contain all (non-duplicate) outgoing transitions of 's2'.
  311. *
  312. * @param ctx context
  313. * @param a automaton
  314. * @param s1 first state
  315. * @param s2 second state, will be destroyed
  316. */
  317. static void
  318. automaton_merge_states (struct REGEX_INTERNAL_Context *ctx,
  319. struct REGEX_INTERNAL_Automaton *a,
  320. struct REGEX_INTERNAL_State *s1,
  321. struct REGEX_INTERNAL_State *s2)
  322. {
  323. struct REGEX_INTERNAL_State *s_check;
  324. struct REGEX_INTERNAL_Transition *t_check;
  325. struct REGEX_INTERNAL_Transition *t;
  326. struct REGEX_INTERNAL_Transition *t_next;
  327. int is_dup;
  328. if (s1 == s2)
  329. return;
  330. /* 1. Make all transitions pointing to s2 point to s1, unless this transition
  331. * does not already exists, if it already exists remove transition. */
  332. for (s_check = a->states_head; NULL != s_check; s_check = s_check->next)
  333. {
  334. for (t_check = s_check->transitions_head; NULL != t_check; t_check = t_next)
  335. {
  336. t_next = t_check->next;
  337. if (s2 == t_check->to_state)
  338. {
  339. is_dup = GNUNET_NO;
  340. for (t = t_check->from_state->transitions_head; NULL != t; t = t->next)
  341. {
  342. if ((t->to_state == s1) && (0 == strcmp (t_check->label, t->label)) )
  343. is_dup = GNUNET_YES;
  344. }
  345. if (GNUNET_NO == is_dup)
  346. t_check->to_state = s1;
  347. else
  348. state_remove_transition (t_check->from_state, t_check);
  349. }
  350. }
  351. }
  352. /* 2. Add all transitions from s2 to sX to s1 */
  353. for (t_check = s2->transitions_head; NULL != t_check; t_check = t_check->next)
  354. {
  355. if (t_check->to_state != s1)
  356. state_add_transition (ctx, s1, t_check->label, t_check->to_state);
  357. }
  358. /* 3. Rename s1 to {s1,s2} */
  359. #if REGEX_DEBUG_DFA
  360. char *new_name;
  361. new_name = s1->name;
  362. GNUNET_asprintf (&s1->name, "{%s,%s}", new_name, s2->name);
  363. GNUNET_free (new_name);
  364. #endif
  365. /* remove state */
  366. GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s2);
  367. a->state_count--;
  368. automaton_destroy_state (s2);
  369. }
  370. /**
  371. * Add a state to the automaton 'a', always use this function to alter the
  372. * states DLL of the automaton.
  373. *
  374. * @param a automaton to add the state to
  375. * @param s state that should be added
  376. */
  377. static void
  378. automaton_add_state (struct REGEX_INTERNAL_Automaton *a,
  379. struct REGEX_INTERNAL_State *s)
  380. {
  381. GNUNET_CONTAINER_DLL_insert (a->states_head, a->states_tail, s);
  382. a->state_count++;
  383. }
  384. /**
  385. * Depth-first traversal (DFS) of all states that are reachable from state
  386. * 's'. Performs 'action' on each visited state.
  387. *
  388. * @param s start state.
  389. * @param marks an array of size a->state_count to remember which state was
  390. * already visited.
  391. * @param count current count of the state.
  392. * @param check function that is checked before advancing on each transition
  393. * in the DFS.
  394. * @param check_cls closure for check.
  395. * @param action action to be performed on each state.
  396. * @param action_cls closure for action.
  397. */
  398. static void
  399. automaton_state_traverse (struct REGEX_INTERNAL_State *s,
  400. int *marks,
  401. unsigned int *count,
  402. REGEX_INTERNAL_traverse_check check,
  403. void *check_cls,
  404. REGEX_INTERNAL_traverse_action action,
  405. void *action_cls)
  406. {
  407. struct REGEX_INTERNAL_Transition *t;
  408. if (GNUNET_YES == marks[s->traversal_id])
  409. return;
  410. marks[s->traversal_id] = GNUNET_YES;
  411. if (NULL != action)
  412. action (action_cls, *count, s);
  413. (*count)++;
  414. for (t = s->transitions_head; NULL != t; t = t->next)
  415. {
  416. if ((NULL == check) ||
  417. ((NULL != check) && (GNUNET_YES == check (check_cls, s, t)) ))
  418. {
  419. automaton_state_traverse (t->to_state,
  420. marks,
  421. count,
  422. check,
  423. check_cls,
  424. action,
  425. action_cls);
  426. }
  427. }
  428. }
  429. /**
  430. * Traverses the given automaton using depth-first-search (DFS) from it's start
  431. * state, visiting all reachable states and calling 'action' on each one of
  432. * them.
  433. *
  434. * @param a automaton to be traversed.
  435. * @param start start state, pass a->start or NULL to traverse the whole automaton.
  436. * @param check function that is checked before advancing on each transition
  437. * in the DFS.
  438. * @param check_cls closure for @a check.
  439. * @param action action to be performed on each state.
  440. * @param action_cls closure for @a action
  441. */
  442. void
  443. REGEX_INTERNAL_automaton_traverse (const struct REGEX_INTERNAL_Automaton *a,
  444. struct REGEX_INTERNAL_State *start,
  445. REGEX_INTERNAL_traverse_check check,
  446. void *check_cls,
  447. REGEX_INTERNAL_traverse_action action,
  448. void *action_cls)
  449. {
  450. unsigned int count;
  451. struct REGEX_INTERNAL_State *s;
  452. if ((NULL == a) || (0 == a->state_count))
  453. return;
  454. int marks[a->state_count];
  455. for (count = 0, s = a->states_head; NULL != s && count < a->state_count;
  456. s = s->next, count++)
  457. {
  458. s->traversal_id = count;
  459. marks[s->traversal_id] = GNUNET_NO;
  460. }
  461. count = 0;
  462. if (NULL == start)
  463. s = a->start;
  464. else
  465. s = start;
  466. automaton_state_traverse (s,
  467. marks,
  468. &count,
  469. check,
  470. check_cls,
  471. action,
  472. action_cls);
  473. }
  474. /**
  475. * String container for faster string operations.
  476. */
  477. struct StringBuffer
  478. {
  479. /**
  480. * Buffer holding the string (may start in the middle!);
  481. * NOT 0-terminated!
  482. */
  483. char *sbuf;
  484. /**
  485. * Allocated buffer.
  486. */
  487. char *abuf;
  488. /**
  489. * Length of the string in the buffer.
  490. */
  491. size_t slen;
  492. /**
  493. * Number of bytes allocated for @e sbuf
  494. */
  495. unsigned int blen;
  496. /**
  497. * Buffer currently represents "NULL" (not the empty string!)
  498. */
  499. int16_t null_flag;
  500. /**
  501. * If this entry is part of the last/current generation array,
  502. * this flag is #GNUNET_YES if the last and current generation are
  503. * identical (and thus copying is unnecessary if the value didn't
  504. * change). This is used in an optimization that improves
  505. * performance by about 1% --- if we use int16_t here. With just
  506. * "int" for both flags, performance drops (on my system) significantly,
  507. * most likely due to increased cache misses.
  508. */
  509. int16_t synced;
  510. };
  511. /**
  512. * Compare two strings for equality. If either is NULL they are not equal.
  513. *
  514. * @param s1 first string for comparison.
  515. * @param s2 second string for comparison.
  516. *
  517. * @return 0 if the strings are the same or both NULL, 1 or -1 if not.
  518. */
  519. static int
  520. sb_nullstrcmp (const struct StringBuffer *s1, const struct StringBuffer *s2)
  521. {
  522. if ((GNUNET_YES == s1->null_flag) && (GNUNET_YES == s2->null_flag))
  523. return 0;
  524. if ((GNUNET_YES == s1->null_flag) || (GNUNET_YES == s2->null_flag))
  525. return -1;
  526. if (s1->slen != s2->slen)
  527. return -1;
  528. if (0 == s1->slen)
  529. return 0;
  530. return memcmp (s1->sbuf, s2->sbuf, s1->slen);
  531. }
  532. /**
  533. * Compare two strings for equality.
  534. *
  535. * @param s1 first string for comparison.
  536. * @param s2 second string for comparison.
  537. *
  538. * @return 0 if the strings are the same, 1 or -1 if not.
  539. */
  540. static int
  541. sb_strcmp (const struct StringBuffer *s1, const struct StringBuffer *s2)
  542. {
  543. if (s1->slen != s2->slen)
  544. return -1;
  545. if (0 == s1->slen)
  546. return 0;
  547. return memcmp (s1->sbuf, s2->sbuf, s1->slen);
  548. }
  549. /**
  550. * Reallocate the buffer of 'ret' to fit 'nlen' characters;
  551. * move the existing string to the beginning of the new buffer.
  552. *
  553. * @param ret current buffer, to be updated
  554. * @param nlen target length for the buffer, must be at least ret->slen
  555. */
  556. static void
  557. sb_realloc (struct StringBuffer *ret, size_t nlen)
  558. {
  559. char *old;
  560. GNUNET_assert (nlen >= ret->slen);
  561. old = ret->abuf;
  562. ret->abuf = GNUNET_malloc (nlen);
  563. ret->blen = nlen;
  564. GNUNET_memcpy (ret->abuf, ret->sbuf, ret->slen);
  565. ret->sbuf = ret->abuf;
  566. GNUNET_free (old);
  567. }
  568. /**
  569. * Append a string.
  570. *
  571. * @param ret where to write the result
  572. * @param sarg string to append
  573. */
  574. static void
  575. sb_append (struct StringBuffer *ret, const struct StringBuffer *sarg)
  576. {
  577. if (GNUNET_YES == ret->null_flag)
  578. ret->slen = 0;
  579. ret->null_flag = GNUNET_NO;
  580. if (ret->blen < sarg->slen + ret->slen)
  581. sb_realloc (ret, ret->blen + sarg->slen + 128);
  582. GNUNET_memcpy (&ret->sbuf[ret->slen], sarg->sbuf, sarg->slen);
  583. ret->slen += sarg->slen;
  584. }
  585. /**
  586. * Append a C string.
  587. *
  588. * @param ret where to write the result
  589. * @param cstr string to append
  590. */
  591. static void
  592. sb_append_cstr (struct StringBuffer *ret, const char *cstr)
  593. {
  594. size_t cstr_len = strlen (cstr);
  595. if (GNUNET_YES == ret->null_flag)
  596. ret->slen = 0;
  597. ret->null_flag = GNUNET_NO;
  598. if (ret->blen < cstr_len + ret->slen)
  599. sb_realloc (ret, ret->blen + cstr_len + 128);
  600. GNUNET_memcpy (&ret->sbuf[ret->slen], cstr, cstr_len);
  601. ret->slen += cstr_len;
  602. }
  603. /**
  604. * Wrap a string buffer, that is, set ret to the format string
  605. * which contains an "%s" which is to be replaced with the original
  606. * content of 'ret'. Note that optimizing this function is not
  607. * really worth it, it is rarely called.
  608. *
  609. * @param ret where to write the result and take the input for %.*s from
  610. * @param format format string, fprintf-style, with exactly one "%.*s"
  611. * @param extra_chars how long will the result be, in addition to 'sarg' length
  612. */
  613. static void
  614. sb_wrap (struct StringBuffer *ret, const char *format, size_t extra_chars)
  615. {
  616. char *temp;
  617. if (GNUNET_YES == ret->null_flag)
  618. ret->slen = 0;
  619. ret->null_flag = GNUNET_NO;
  620. temp = GNUNET_malloc (ret->slen + extra_chars + 1);
  621. GNUNET_snprintf (temp,
  622. ret->slen + extra_chars + 1,
  623. format,
  624. (int) ret->slen,
  625. ret->sbuf);
  626. GNUNET_free (ret->abuf);
  627. ret->abuf = temp;
  628. ret->sbuf = temp;
  629. ret->blen = ret->slen + extra_chars + 1;
  630. ret->slen = ret->slen + extra_chars;
  631. }
  632. /**
  633. * Format a string buffer. Note that optimizing this function is not
  634. * really worth it, it is rarely called.
  635. *
  636. * @param ret where to write the result
  637. * @param format format string, fprintf-style, with exactly one "%.*s"
  638. * @param extra_chars how long will the result be, in addition to 'sarg' length
  639. * @param sarg string to print into the format
  640. */
  641. static void
  642. sb_printf1 (struct StringBuffer *ret,
  643. const char *format,
  644. size_t extra_chars,
  645. const struct StringBuffer *sarg)
  646. {
  647. if (ret->blen < sarg->slen + extra_chars + 1)
  648. sb_realloc (ret, sarg->slen + extra_chars + 1);
  649. ret->null_flag = GNUNET_NO;
  650. ret->sbuf = ret->abuf;
  651. ret->slen = sarg->slen + extra_chars;
  652. GNUNET_snprintf (ret->sbuf, ret->blen, format, (int) sarg->slen, sarg->sbuf);
  653. }
  654. /**
  655. * Format a string buffer.
  656. *
  657. * @param ret where to write the result
  658. * @param format format string, fprintf-style, with exactly two "%.*s"
  659. * @param extra_chars how long will the result be, in addition to 'sarg1/2' length
  660. * @param sarg1 first string to print into the format
  661. * @param sarg2 second string to print into the format
  662. */
  663. static void
  664. sb_printf2 (struct StringBuffer *ret,
  665. const char *format,
  666. size_t extra_chars,
  667. const struct StringBuffer *sarg1,
  668. const struct StringBuffer *sarg2)
  669. {
  670. if (ret->blen < sarg1->slen + sarg2->slen + extra_chars + 1)
  671. sb_realloc (ret, sarg1->slen + sarg2->slen + extra_chars + 1);
  672. ret->null_flag = GNUNET_NO;
  673. ret->slen = sarg1->slen + sarg2->slen + extra_chars;
  674. ret->sbuf = ret->abuf;
  675. GNUNET_snprintf (ret->sbuf,
  676. ret->blen,
  677. format,
  678. (int) sarg1->slen,
  679. sarg1->sbuf,
  680. (int) sarg2->slen,
  681. sarg2->sbuf);
  682. }
  683. /**
  684. * Format a string buffer. Note that optimizing this function is not
  685. * really worth it, it is rarely called.
  686. *
  687. * @param ret where to write the result
  688. * @param format format string, fprintf-style, with exactly three "%.*s"
  689. * @param extra_chars how long will the result be, in addition to 'sarg1/2/3' length
  690. * @param sarg1 first string to print into the format
  691. * @param sarg2 second string to print into the format
  692. * @param sarg3 third string to print into the format
  693. */
  694. static void
  695. sb_printf3 (struct StringBuffer *ret,
  696. const char *format,
  697. size_t extra_chars,
  698. const struct StringBuffer *sarg1,
  699. const struct StringBuffer *sarg2,
  700. const struct StringBuffer *sarg3)
  701. {
  702. if (ret->blen < sarg1->slen + sarg2->slen + sarg3->slen + extra_chars + 1)
  703. sb_realloc (ret, sarg1->slen + sarg2->slen + sarg3->slen + extra_chars + 1);
  704. ret->null_flag = GNUNET_NO;
  705. ret->slen = sarg1->slen + sarg2->slen + sarg3->slen + extra_chars;
  706. ret->sbuf = ret->abuf;
  707. GNUNET_snprintf (ret->sbuf,
  708. ret->blen,
  709. format,
  710. (int) sarg1->slen,
  711. sarg1->sbuf,
  712. (int) sarg2->slen,
  713. sarg2->sbuf,
  714. (int) sarg3->slen,
  715. sarg3->sbuf);
  716. }
  717. /**
  718. * Free resources of the given string buffer.
  719. *
  720. * @param sb buffer to free (actual pointer is not freed, as they
  721. * should not be individually allocated)
  722. */
  723. static void
  724. sb_free (struct StringBuffer *sb)
  725. {
  726. GNUNET_array_grow (sb->abuf, sb->blen, 0);
  727. sb->slen = 0;
  728. sb->sbuf = NULL;
  729. sb->null_flag = GNUNET_YES;
  730. }
  731. /**
  732. * Copy the given string buffer from 'in' to 'out'.
  733. *
  734. * @param in input string
  735. * @param out output string
  736. */
  737. static void
  738. sb_strdup (struct StringBuffer *out, const struct StringBuffer *in)
  739. {
  740. out->null_flag = in->null_flag;
  741. if (GNUNET_YES == out->null_flag)
  742. return;
  743. if (out->blen < in->slen)
  744. {
  745. GNUNET_array_grow (out->abuf, out->blen, in->slen);
  746. }
  747. out->sbuf = out->abuf;
  748. out->slen = in->slen;
  749. GNUNET_memcpy (out->sbuf, in->sbuf, out->slen);
  750. }
  751. /**
  752. * Copy the given string buffer from 'in' to 'out'.
  753. *
  754. * @param cstr input string
  755. * @param out output string
  756. */
  757. static void
  758. sb_strdup_cstr (struct StringBuffer *out, const char *cstr)
  759. {
  760. if (NULL == cstr)
  761. {
  762. out->null_flag = GNUNET_YES;
  763. return;
  764. }
  765. out->null_flag = GNUNET_NO;
  766. out->slen = strlen (cstr);
  767. if (out->blen < out->slen)
  768. {
  769. GNUNET_array_grow (out->abuf, out->blen, out->slen);
  770. }
  771. out->sbuf = out->abuf;
  772. GNUNET_memcpy (out->sbuf, cstr, out->slen);
  773. }
  774. /**
  775. * Check if the given string @a str needs parentheses around it when
  776. * using it to generate a regex.
  777. *
  778. * @param str string
  779. *
  780. * @return #GNUNET_YES if parentheses are needed, #GNUNET_NO otherwise
  781. */
  782. static int
  783. needs_parentheses (const struct StringBuffer *str)
  784. {
  785. size_t slen;
  786. const char *op;
  787. const char *cl;
  788. const char *pos;
  789. const char *end;
  790. unsigned int cnt;
  791. if ((GNUNET_YES == str->null_flag) || ((slen = str->slen) < 2))
  792. return GNUNET_NO;
  793. pos = str->sbuf;
  794. if ('(' != pos[0])
  795. return GNUNET_YES;
  796. end = str->sbuf + slen;
  797. cnt = 1;
  798. pos++;
  799. while (cnt > 0)
  800. {
  801. cl = memchr (pos, ')', end - pos);
  802. if (NULL == cl)
  803. {
  804. GNUNET_break (0);
  805. return GNUNET_YES;
  806. }
  807. /* while '(' before ')', count opening parens */
  808. while ((NULL != (op = memchr (pos, '(', end - pos))) && (op < cl))
  809. {
  810. cnt++;
  811. pos = op + 1;
  812. }
  813. /* got ')' first */
  814. cnt--;
  815. pos = cl + 1;
  816. }
  817. return (*pos == '\0') ? GNUNET_NO : GNUNET_YES;
  818. }
  819. /**
  820. * Remove parentheses surrounding string @a str.
  821. * Example: "(a)" becomes "a", "(a|b)|(a|c)" stays the same.
  822. * You need to #GNUNET_free() the returned string.
  823. *
  824. * @param str string, modified to contain a
  825. * @return string without surrounding parentheses, string 'str' if no preceding
  826. * epsilon could be found, NULL if 'str' was NULL
  827. */
  828. static void
  829. remove_parentheses (struct StringBuffer *str)
  830. {
  831. size_t slen;
  832. const char *pos;
  833. const char *end;
  834. const char *sbuf;
  835. const char *op;
  836. const char *cp;
  837. unsigned int cnt;
  838. if (0)
  839. return;
  840. sbuf = str->sbuf;
  841. if ((GNUNET_YES == str->null_flag) || (1 >= (slen = str->slen)) ||
  842. ('(' != str->sbuf[0]) || (')' != str->sbuf[slen - 1]))
  843. return;
  844. cnt = 0;
  845. pos = &sbuf[1];
  846. end = &sbuf[slen - 1];
  847. op = memchr (pos, '(', end - pos);
  848. cp = memchr (pos, ')', end - pos);
  849. while (NULL != cp)
  850. {
  851. while ((NULL != op) && (op < cp))
  852. {
  853. cnt++;
  854. pos = op + 1;
  855. op = memchr (pos, '(', end - pos);
  856. }
  857. while ((NULL != cp) && ((NULL == op) || (cp < op)))
  858. {
  859. if (0 == cnt)
  860. return; /* can't strip parens */
  861. cnt--;
  862. pos = cp + 1;
  863. cp = memchr (pos, ')', end - pos);
  864. }
  865. }
  866. if (0 != cnt)
  867. {
  868. GNUNET_break (0);
  869. return;
  870. }
  871. str->sbuf++;
  872. str->slen -= 2;
  873. }
  874. /**
  875. * Check if the string 'str' starts with an epsilon (empty string).
  876. * Example: "(|a)" is starting with an epsilon.
  877. *
  878. * @param str string to test
  879. *
  880. * @return 0 if str has no epsilon, 1 if str starts with '(|' and ends with ')'
  881. */
  882. static int
  883. has_epsilon (const struct StringBuffer *str)
  884. {
  885. return (GNUNET_YES != str->null_flag) && (0 < str->slen) &&
  886. ('(' == str->sbuf[0]) && ('|' == str->sbuf[1]) &&
  887. (')' == str->sbuf[str->slen - 1]);
  888. }
  889. /**
  890. * Remove an epsilon from the string str. Where epsilon is an empty string
  891. * Example: str = "(|a|b|c)", result: "a|b|c"
  892. * The returned string needs to be freed.
  893. *
  894. * @param str original string
  895. * @param ret where to return string without preceding epsilon, string 'str' if no preceding
  896. * epsilon could be found, NULL if 'str' was NULL
  897. */
  898. static void
  899. remove_epsilon (const struct StringBuffer *str, struct StringBuffer *ret)
  900. {
  901. if (GNUNET_YES == str->null_flag)
  902. {
  903. ret->null_flag = GNUNET_YES;
  904. return;
  905. }
  906. if ((str->slen > 1) && ('(' == str->sbuf[0]) && ('|' == str->sbuf[1]) &&
  907. (')' == str->sbuf[str->slen - 1]))
  908. {
  909. /* remove epsilon */
  910. if (ret->blen < str->slen - 3)
  911. {
  912. GNUNET_array_grow (ret->abuf, ret->blen, str->slen - 3);
  913. }
  914. ret->sbuf = ret->abuf;
  915. ret->slen = str->slen - 3;
  916. GNUNET_memcpy (ret->sbuf, &str->sbuf[2], ret->slen);
  917. return;
  918. }
  919. sb_strdup (ret, str);
  920. }
  921. /**
  922. * Compare n bytes of 'str1' and 'str2'
  923. *
  924. * @param str1 first string to compare
  925. * @param str2 second string for comparison
  926. * @param n number of bytes to compare
  927. *
  928. * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise
  929. */
  930. static int
  931. sb_strncmp (const struct StringBuffer *str1,
  932. const struct StringBuffer *str2,
  933. size_t n)
  934. {
  935. size_t max;
  936. if ((str1->slen != str2->slen) && ((str1->slen < n) || (str2->slen < n)))
  937. return -1;
  938. max = GNUNET_MAX (str1->slen, str2->slen);
  939. if (max > n)
  940. max = n;
  941. return memcmp (str1->sbuf, str2->sbuf, max);
  942. }
  943. /**
  944. * Compare n bytes of 'str1' and 'str2'
  945. *
  946. * @param str1 first string to compare
  947. * @param str2 second C string for comparison
  948. * @param n number of bytes to compare (and length of str2)
  949. *
  950. * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise
  951. */
  952. static int
  953. sb_strncmp_cstr (const struct StringBuffer *str1, const char *str2, size_t n)
  954. {
  955. if (str1->slen < n)
  956. return -1;
  957. return memcmp (str1->sbuf, str2, n);
  958. }
  959. /**
  960. * Initialize string buffer for storing strings of up to n
  961. * characters.
  962. *
  963. * @param sb buffer to initialize
  964. * @param n desired target length
  965. */
  966. static void
  967. sb_init (struct StringBuffer *sb, size_t n)
  968. {
  969. sb->null_flag = GNUNET_NO;
  970. sb->abuf = sb->sbuf = (0 == n) ? NULL : GNUNET_malloc (n);
  971. sb->blen = n;
  972. sb->slen = 0;
  973. }
  974. /**
  975. * Compare 'str1', starting from position 'k', with whole 'str2'
  976. *
  977. * @param str1 first string to compare, starting from position 'k'
  978. * @param str2 second string for comparison
  979. * @param k starting position in 'str1'
  980. *
  981. * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise
  982. */
  983. static int
  984. sb_strkcmp (const struct StringBuffer *str1,
  985. const struct StringBuffer *str2,
  986. size_t k)
  987. {
  988. if ((GNUNET_YES == str1->null_flag) || (GNUNET_YES == str2->null_flag) ||
  989. (k > str1->slen) || (str1->slen - k != str2->slen))
  990. return -1;
  991. return memcmp (&str1->sbuf[k], str2->sbuf, str2->slen);
  992. }
  993. /**
  994. * Helper function used as 'action' in 'REGEX_INTERNAL_automaton_traverse'
  995. * function to create the depth-first numbering of the states.
  996. *
  997. * @param cls states array.
  998. * @param count current state counter.
  999. * @param s current state.
  1000. */
  1001. static void
  1002. number_states (void *cls,
  1003. const unsigned int count,
  1004. struct REGEX_INTERNAL_State *s)
  1005. {
  1006. struct REGEX_INTERNAL_State **states = cls;
  1007. s->dfs_id = count;
  1008. if (NULL != states)
  1009. states[count] = s;
  1010. }
  1011. #define PRIS(a) \
  1012. ((GNUNET_YES == a.null_flag) ? 6 : (int) a.slen), \
  1013. ((GNUNET_YES == a.null_flag) ? "(null)" : a.sbuf)
  1014. /**
  1015. * Construct the regular expression given the inductive step,
  1016. * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^*
  1017. * R^{(k-1)}_{kj}, and simplify the resulting expression saved in R_cur_ij.
  1018. *
  1019. * @param R_last_ij value of $R^{(k-1)_{ij}.
  1020. * @param R_last_ik value of $R^{(k-1)_{ik}.
  1021. * @param R_last_kk value of $R^{(k-1)_{kk}.
  1022. * @param R_last_kj value of $R^{(k-1)_{kj}.
  1023. * @param R_cur_ij result for this inductive step is saved in R_cur_ij, R_cur_ij
  1024. * is expected to be NULL when called!
  1025. * @param R_cur_l optimization -- kept between iterations to avoid realloc
  1026. * @param R_cur_r optimization -- kept between iterations to avoid realloc
  1027. */
  1028. static void
  1029. automaton_create_proofs_simplify (const struct StringBuffer *R_last_ij,
  1030. const struct StringBuffer *R_last_ik,
  1031. const struct StringBuffer *R_last_kk,
  1032. const struct StringBuffer *R_last_kj,
  1033. struct StringBuffer *R_cur_ij,
  1034. struct StringBuffer *R_cur_l,
  1035. struct StringBuffer *R_cur_r)
  1036. {
  1037. struct StringBuffer R_temp_ij;
  1038. struct StringBuffer R_temp_ik;
  1039. struct StringBuffer R_temp_kj;
  1040. struct StringBuffer R_temp_kk;
  1041. int eps_check;
  1042. int ij_ik_cmp;
  1043. int ij_kj_cmp;
  1044. int ik_kk_cmp;
  1045. int kk_kj_cmp;
  1046. int clean_ik_kk_cmp;
  1047. int clean_kk_kj_cmp;
  1048. size_t length;
  1049. size_t length_l;
  1050. size_t length_r;
  1051. /*
  1052. * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj}
  1053. * R_last == R^{(k-1)}, R_cur == R^{(k)}
  1054. * R_cur_ij = R_cur_l | R_cur_r
  1055. * R_cur_l == R^{(k-1)}_{ij}
  1056. * R_cur_r == R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj}
  1057. */if ((GNUNET_YES == R_last_ij->null_flag) &&
  1058. ((GNUNET_YES == R_last_ik->null_flag) ||
  1059. (GNUNET_YES == R_last_kj->null_flag)))
  1060. {
  1061. /* R^{(k)}_{ij} = N | N */
  1062. R_cur_ij->null_flag = GNUNET_YES;
  1063. R_cur_ij->synced = GNUNET_NO;
  1064. return;
  1065. }
  1066. if ((GNUNET_YES == R_last_ik->null_flag) ||
  1067. (GNUNET_YES == R_last_kj->null_flag))
  1068. {
  1069. /* R^{(k)}_{ij} = R^{(k-1)}_{ij} | N */
  1070. if (GNUNET_YES == R_last_ij->synced)
  1071. {
  1072. R_cur_ij->synced = GNUNET_YES;
  1073. R_cur_ij->null_flag = GNUNET_NO;
  1074. return;
  1075. }
  1076. R_cur_ij->synced = GNUNET_YES;
  1077. sb_strdup (R_cur_ij, R_last_ij);
  1078. return;
  1079. }
  1080. R_cur_ij->synced = GNUNET_NO;
  1081. /* $R^{(k)}_{ij} = N | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} OR
  1082. * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} */
  1083. R_cur_r->null_flag = GNUNET_YES;
  1084. R_cur_r->slen = 0;
  1085. R_cur_l->null_flag = GNUNET_YES;
  1086. R_cur_l->slen = 0;
  1087. /* cache results from strcmp, we might need these many times */
  1088. ij_kj_cmp = sb_nullstrcmp (R_last_ij, R_last_kj);
  1089. ij_ik_cmp = sb_nullstrcmp (R_last_ij, R_last_ik);
  1090. ik_kk_cmp = sb_nullstrcmp (R_last_ik, R_last_kk);
  1091. kk_kj_cmp = sb_nullstrcmp (R_last_kk, R_last_kj);
  1092. /* Assign R_temp_(ik|kk|kj) to R_last[][] and remove epsilon as well
  1093. * as parentheses, so we can better compare the contents */
  1094. memset (&R_temp_ij, 0, sizeof(struct StringBuffer));
  1095. memset (&R_temp_ik, 0, sizeof(struct StringBuffer));
  1096. memset (&R_temp_kk, 0, sizeof(struct StringBuffer));
  1097. memset (&R_temp_kj, 0, sizeof(struct StringBuffer));
  1098. remove_epsilon (R_last_ik, &R_temp_ik);
  1099. remove_epsilon (R_last_kk, &R_temp_kk);
  1100. remove_epsilon (R_last_kj, &R_temp_kj);
  1101. remove_parentheses (&R_temp_ik);
  1102. remove_parentheses (&R_temp_kk);
  1103. remove_parentheses (&R_temp_kj);
  1104. clean_ik_kk_cmp = sb_nullstrcmp (R_last_ik, &R_temp_kk);
  1105. clean_kk_kj_cmp = sb_nullstrcmp (&R_temp_kk, R_last_kj);
  1106. /* construct R_cur_l (and, if necessary R_cur_r) */
  1107. if (GNUNET_YES != R_last_ij->null_flag)
  1108. {
  1109. /* Assign R_temp_ij to R_last_ij and remove epsilon as well
  1110. * as parentheses, so we can better compare the contents */
  1111. remove_epsilon (R_last_ij, &R_temp_ij);
  1112. remove_parentheses (&R_temp_ij);
  1113. if ((0 == sb_strcmp (&R_temp_ij, &R_temp_ik)) &&
  1114. (0 == sb_strcmp (&R_temp_ik, &R_temp_kk)) &&
  1115. (0 == sb_strcmp (&R_temp_kk, &R_temp_kj)))
  1116. {
  1117. if (0 == R_temp_ij.slen)
  1118. {
  1119. R_cur_r->null_flag = GNUNET_NO;
  1120. }
  1121. else if ((0 == sb_strncmp_cstr (R_last_ij, "(|", 2)) ||
  1122. ((0 == sb_strncmp_cstr (R_last_ik, "(|", 2)) &&
  1123. (0 == sb_strncmp_cstr (R_last_kj, "(|", 2)) ))
  1124. {
  1125. /*
  1126. * a|(e|a)a*(e|a) = a*
  1127. * a|(e|a)(e|a)*(e|a) = a*
  1128. * (e|a)|aa*a = a*
  1129. * (e|a)|aa*(e|a) = a*
  1130. * (e|a)|(e|a)a*a = a*
  1131. * (e|a)|(e|a)a*(e|a) = a*
  1132. * (e|a)|(e|a)(e|a)*(e|a) = a*
  1133. */if (GNUNET_YES == needs_parentheses (&R_temp_ij))
  1134. sb_printf1 (R_cur_r, "(%.*s)*", 3, &R_temp_ij);
  1135. else
  1136. sb_printf1 (R_cur_r, "%.*s*", 1, &R_temp_ij);
  1137. }
  1138. else
  1139. {
  1140. /*
  1141. * a|aa*a = a+
  1142. * a|(e|a)a*a = a+
  1143. * a|aa*(e|a) = a+
  1144. * a|(e|a)(e|a)*a = a+
  1145. * a|a(e|a)*(e|a) = a+
  1146. */if (GNUNET_YES == needs_parentheses (&R_temp_ij))
  1147. sb_printf1 (R_cur_r, "(%.*s)+", 3, &R_temp_ij);
  1148. else
  1149. sb_printf1 (R_cur_r, "%.*s+", 1, &R_temp_ij);
  1150. }
  1151. }
  1152. else if ((0 == ij_ik_cmp) && (0 == clean_kk_kj_cmp) &&
  1153. (0 != clean_ik_kk_cmp))
  1154. {
  1155. /* a|ab*b = ab* */
  1156. if (0 == R_last_kk->slen)
  1157. sb_strdup (R_cur_r, R_last_ij);
  1158. else if (GNUNET_YES == needs_parentheses (&R_temp_kk))
  1159. sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ij, &R_temp_kk);
  1160. else
  1161. sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ij, R_last_kk);
  1162. R_cur_l->null_flag = GNUNET_YES;
  1163. }
  1164. else if ((0 == ij_kj_cmp) && (0 == clean_ik_kk_cmp) &&
  1165. (0 != clean_kk_kj_cmp))
  1166. {
  1167. /* a|bb*a = b*a */
  1168. if (R_last_kk->slen < 1)
  1169. {
  1170. sb_strdup (R_cur_r, R_last_kj);
  1171. }
  1172. else if (GNUNET_YES == needs_parentheses (&R_temp_kk))
  1173. sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_kj);
  1174. else
  1175. sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_kj);
  1176. R_cur_l->null_flag = GNUNET_YES;
  1177. }
  1178. else if ((0 == ij_ik_cmp) && (0 == kk_kj_cmp) &&
  1179. (! has_epsilon (R_last_ij)) && has_epsilon (R_last_kk))
  1180. {
  1181. /* a|a(e|b)*(e|b) = a|ab* = a|a|ab|abb|abbb|... = ab* */
  1182. if (needs_parentheses (&R_temp_kk))
  1183. sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ij, &R_temp_kk);
  1184. else
  1185. sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ij, &R_temp_kk);
  1186. R_cur_l->null_flag = GNUNET_YES;
  1187. }
  1188. else if ((0 == ij_kj_cmp) && (0 == ik_kk_cmp) &&
  1189. (! has_epsilon (R_last_ij)) && has_epsilon (R_last_kk))
  1190. {
  1191. /* a|(e|b)(e|b)*a = a|b*a = a|a|ba|bba|bbba|... = b*a */
  1192. if (needs_parentheses (&R_temp_kk))
  1193. sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_ij);
  1194. else
  1195. sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_ij);
  1196. R_cur_l->null_flag = GNUNET_YES;
  1197. }
  1198. else
  1199. {
  1200. sb_strdup (R_cur_l, R_last_ij);
  1201. remove_parentheses (R_cur_l);
  1202. }
  1203. }
  1204. else
  1205. {
  1206. /* we have no left side */
  1207. R_cur_l->null_flag = GNUNET_YES;
  1208. }
  1209. /* construct R_cur_r, if not already constructed */
  1210. if (GNUNET_YES == R_cur_r->null_flag)
  1211. {
  1212. length = R_temp_kk.slen - R_last_ik->slen;
  1213. /* a(ba)*bx = (ab)+x */
  1214. if ((length > 0) && (GNUNET_YES != R_last_kk->null_flag) &&
  1215. (0 < R_last_kk->slen) && (GNUNET_YES != R_last_kj->null_flag) &&
  1216. (0 < R_last_kj->slen) && (GNUNET_YES != R_last_ik->null_flag) &&
  1217. (0 < R_last_ik->slen) &&
  1218. (0 == sb_strkcmp (&R_temp_kk, R_last_ik, length)) &&
  1219. (0 == sb_strncmp (&R_temp_kk, R_last_kj, length)))
  1220. {
  1221. struct StringBuffer temp_a;
  1222. struct StringBuffer temp_b;
  1223. sb_init (&temp_a, length);
  1224. sb_init (&temp_b, R_last_kj->slen - length);
  1225. length_l = length;
  1226. temp_a.sbuf = temp_a.abuf;
  1227. GNUNET_memcpy (temp_a.sbuf, R_last_kj->sbuf, length_l);
  1228. temp_a.slen = length_l;
  1229. length_r = R_last_kj->slen - length;
  1230. temp_b.sbuf = temp_b.abuf;
  1231. GNUNET_memcpy (temp_b.sbuf, &R_last_kj->sbuf[length], length_r);
  1232. temp_b.slen = length_r;
  1233. /* e|(ab)+ = (ab)* */
  1234. if ((GNUNET_YES != R_cur_l->null_flag) && (0 == R_cur_l->slen) &&
  1235. (0 == temp_b.slen))
  1236. {
  1237. sb_printf2 (R_cur_r, "(%.*s%.*s)*", 3, R_last_ik, &temp_a);
  1238. sb_free (R_cur_l);
  1239. R_cur_l->null_flag = GNUNET_YES;
  1240. }
  1241. else
  1242. {
  1243. sb_printf3 (R_cur_r, "(%.*s%.*s)+%.*s", 3, R_last_ik, &temp_a, &temp_b);
  1244. }
  1245. sb_free (&temp_a);
  1246. sb_free (&temp_b);
  1247. }
  1248. else if ((0 == sb_strcmp (&R_temp_ik, &R_temp_kk)) &&
  1249. (0 == sb_strcmp (&R_temp_kk, &R_temp_kj)) )
  1250. {
  1251. /*
  1252. * (e|a)a*(e|a) = a*
  1253. * (e|a)(e|a)*(e|a) = a*
  1254. */
  1255. if (has_epsilon (R_last_ik) && has_epsilon (R_last_kj))
  1256. {
  1257. if (needs_parentheses (&R_temp_kk))
  1258. sb_printf1 (R_cur_r, "(%.*s)*", 3, &R_temp_kk);
  1259. else
  1260. sb_printf1 (R_cur_r, "%.*s*", 1, &R_temp_kk);
  1261. }
  1262. /* aa*a = a+a */
  1263. else if ((0 == clean_ik_kk_cmp) && (0 == clean_kk_kj_cmp) &&
  1264. (! has_epsilon (R_last_ik)))
  1265. {
  1266. if (needs_parentheses (&R_temp_kk))
  1267. sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, &R_temp_kk, &R_temp_kk);
  1268. else
  1269. sb_printf2 (R_cur_r, "%.*s+%.*s", 1, &R_temp_kk, &R_temp_kk);
  1270. }
  1271. /*
  1272. * (e|a)a*a = a+
  1273. * aa*(e|a) = a+
  1274. * a(e|a)*(e|a) = a+
  1275. * (e|a)a*a = a+
  1276. */else
  1277. {
  1278. eps_check = (has_epsilon (R_last_ik) + has_epsilon (R_last_kk)
  1279. + has_epsilon (R_last_kj));
  1280. if (1 == eps_check)
  1281. {
  1282. if (needs_parentheses (&R_temp_kk))
  1283. sb_printf1 (R_cur_r, "(%.*s)+", 3, &R_temp_kk);
  1284. else
  1285. sb_printf1 (R_cur_r, "%.*s+", 1, &R_temp_kk);
  1286. }
  1287. }
  1288. }
  1289. /*
  1290. * aa*b = a+b
  1291. * (e|a)(e|a)*b = a*b
  1292. */
  1293. else if (0 == sb_strcmp (&R_temp_ik, &R_temp_kk))
  1294. {
  1295. if (has_epsilon (R_last_ik))
  1296. {
  1297. if (needs_parentheses (&R_temp_kk))
  1298. sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_kj);
  1299. else
  1300. sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_kj);
  1301. }
  1302. else
  1303. {
  1304. if (needs_parentheses (&R_temp_kk))
  1305. sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, &R_temp_kk, R_last_kj);
  1306. else
  1307. sb_printf2 (R_cur_r, "%.*s+%.*s", 1, &R_temp_kk, R_last_kj);
  1308. }
  1309. }
  1310. /*
  1311. * ba*a = ba+
  1312. * b(e|a)*(e|a) = ba*
  1313. */
  1314. else if (0 == sb_strcmp (&R_temp_kk, &R_temp_kj))
  1315. {
  1316. if (has_epsilon (R_last_kj))
  1317. {
  1318. if (needs_parentheses (&R_temp_kk))
  1319. sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ik, &R_temp_kk);
  1320. else
  1321. sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ik, &R_temp_kk);
  1322. }
  1323. else
  1324. {
  1325. if (needs_parentheses (&R_temp_kk))
  1326. sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, R_last_ik, &R_temp_kk);
  1327. else
  1328. sb_printf2 (R_cur_r, "%.*s+%.*s", 1, R_last_ik, &R_temp_kk);
  1329. }
  1330. }
  1331. else
  1332. {
  1333. if (0 < R_temp_kk.slen)
  1334. {
  1335. if (needs_parentheses (&R_temp_kk))
  1336. {
  1337. sb_printf3 (R_cur_r,
  1338. "%.*s(%.*s)*%.*s",
  1339. 3,
  1340. R_last_ik,
  1341. &R_temp_kk,
  1342. R_last_kj);
  1343. }
  1344. else
  1345. {
  1346. sb_printf3 (R_cur_r,
  1347. "%.*s%.*s*%.*s",
  1348. 1,
  1349. R_last_ik,
  1350. &R_temp_kk,
  1351. R_last_kj);
  1352. }
  1353. }
  1354. else
  1355. {
  1356. sb_printf2 (R_cur_r, "%.*s%.*s", 0, R_last_ik, R_last_kj);
  1357. }
  1358. }
  1359. }
  1360. sb_free (&R_temp_ij);
  1361. sb_free (&R_temp_ik);
  1362. sb_free (&R_temp_kk);
  1363. sb_free (&R_temp_kj);
  1364. if ((GNUNET_YES == R_cur_l->null_flag) && (GNUNET_YES == R_cur_r->null_flag))
  1365. {
  1366. R_cur_ij->null_flag = GNUNET_YES;
  1367. return;
  1368. }
  1369. if ((GNUNET_YES != R_cur_l->null_flag) && (GNUNET_YES == R_cur_r->null_flag))
  1370. {
  1371. struct StringBuffer tmp;
  1372. tmp = *R_cur_ij;
  1373. *R_cur_ij = *R_cur_l;
  1374. *R_cur_l = tmp;
  1375. return;
  1376. }
  1377. if ((GNUNET_YES == R_cur_l->null_flag) && (GNUNET_YES != R_cur_r->null_flag))
  1378. {
  1379. struct StringBuffer tmp;
  1380. tmp = *R_cur_ij;
  1381. *R_cur_ij = *R_cur_r;
  1382. *R_cur_r = tmp;
  1383. return;
  1384. }
  1385. if (0 == sb_nullstrcmp (R_cur_l, R_cur_r))
  1386. {
  1387. struct StringBuffer tmp;
  1388. tmp = *R_cur_ij;
  1389. *R_cur_ij = *R_cur_l;
  1390. *R_cur_l = tmp;
  1391. return;
  1392. }
  1393. sb_printf2 (R_cur_ij, "(%.*s|%.*s)", 3, R_cur_l, R_cur_r);
  1394. }
  1395. /**
  1396. * Create proofs for all states in the given automaton. Implementation of the
  1397. * algorithm described in chapter 3.2.1 of "Automata Theory, Languages, and
  1398. * Computation 3rd Edition" by Hopcroft, Motwani and Ullman.
  1399. *
  1400. * Each state in the automaton gets assigned 'proof' and 'hash' (hash of the
  1401. * proof) fields. The starting state will only have a valid proof/hash if it has
  1402. * any incoming transitions.
  1403. *
  1404. * @param a automaton for which to assign proofs and hashes, must not be NULL
  1405. */
  1406. static int
  1407. automaton_create_proofs (struct REGEX_INTERNAL_Automaton *a)
  1408. {
  1409. unsigned int n = a->state_count;
  1410. struct REGEX_INTERNAL_State *states[n];
  1411. struct StringBuffer *R_last;
  1412. struct StringBuffer *R_cur;
  1413. struct StringBuffer R_cur_r;
  1414. struct StringBuffer R_cur_l;
  1415. struct StringBuffer *R_swap;
  1416. struct REGEX_INTERNAL_Transition *t;
  1417. struct StringBuffer complete_regex;
  1418. unsigned int i;
  1419. unsigned int j;
  1420. unsigned int k;
  1421. R_last = GNUNET_malloc_large (sizeof(struct StringBuffer) * n * n);
  1422. R_cur = GNUNET_malloc_large (sizeof(struct StringBuffer) * n * n);
  1423. if ((NULL == R_last) || (NULL == R_cur))
  1424. {
  1425. GNUNET_log_strerror (GNUNET_ERROR_TYPE_ERROR, "malloc");
  1426. GNUNET_free (R_cur);
  1427. GNUNET_free (R_last);
  1428. return GNUNET_SYSERR;
  1429. }
  1430. /* create depth-first numbering of the states, initializes 'state' */
  1431. REGEX_INTERNAL_automaton_traverse (a,
  1432. a->start,
  1433. NULL,
  1434. NULL,
  1435. &number_states,
  1436. states);
  1437. for (i = 0; i < n; i++)
  1438. GNUNET_assert (NULL != states[i]);
  1439. for (i = 0; i < n; i++)
  1440. for (j = 0; j < n; j++)
  1441. R_last[i * n + j].null_flag = GNUNET_YES;
  1442. /* Compute regular expressions of length "1" between each pair of states */
  1443. for (i = 0; i < n; i++)
  1444. {
  1445. for (t = states[i]->transitions_head; NULL != t; t = t->next)
  1446. {
  1447. j = t->to_state->dfs_id;
  1448. if (GNUNET_YES == R_last[i * n + j].null_flag)
  1449. {
  1450. sb_strdup_cstr (&R_last[i * n + j], t->label);
  1451. }
  1452. else
  1453. {
  1454. sb_append_cstr (&R_last[i * n + j], "|");
  1455. sb_append_cstr (&R_last[i * n + j], t->label);
  1456. }
  1457. }
  1458. /* add self-loop: i is reachable from i via epsilon-transition */
  1459. if (GNUNET_YES == R_last[i * n + i].null_flag)
  1460. {
  1461. R_last[i * n + i].slen = 0;
  1462. R_last[i * n + i].null_flag = GNUNET_NO;
  1463. }
  1464. else
  1465. {
  1466. sb_wrap (&R_last[i * n + i], "(|%.*s)", 3);
  1467. }
  1468. }
  1469. for (i = 0; i < n; i++)
  1470. for (j = 0; j < n; j++)
  1471. if (needs_parentheses (&R_last[i * n + j]))
  1472. sb_wrap (&R_last[i * n + j], "(%.*s)", 2);
  1473. /* Compute regular expressions of length "k" between each pair of states per
  1474. * induction */
  1475. memset (&R_cur_l, 0, sizeof(struct StringBuffer));
  1476. memset (&R_cur_r, 0, sizeof(struct StringBuffer));
  1477. for (k = 0; k < n; k++)
  1478. {
  1479. for (i = 0; i < n; i++)
  1480. {
  1481. for (j = 0; j < n; j++)
  1482. {
  1483. /* Basis for the recursion:
  1484. * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj}
  1485. * R_last == R^{(k-1)}, R_cur == R^{(k)}
  1486. */
  1487. /* Create R_cur[i][j] and simplify the expression */
  1488. automaton_create_proofs_simplify (&R_last[i * n + j],
  1489. &R_last[i * n + k],
  1490. &R_last[k * n + k],
  1491. &R_last[k * n + j],
  1492. &R_cur[i * n + j],
  1493. &R_cur_l,
  1494. &R_cur_r);
  1495. }
  1496. }
  1497. /* set R_last = R_cur */
  1498. R_swap = R_last;
  1499. R_last = R_cur;
  1500. R_cur = R_swap;
  1501. /* clear 'R_cur' for next iteration */
  1502. for (i = 0; i < n; i++)
  1503. for (j = 0; j < n; j++)
  1504. R_cur[i * n + j].null_flag = GNUNET_YES;
  1505. }
  1506. sb_free (&R_cur_l);
  1507. sb_free (&R_cur_r);
  1508. /* assign proofs and hashes */
  1509. for (i = 0; i < n; i++)
  1510. {
  1511. if (GNUNET_YES != R_last[a->start->dfs_id * n + i].null_flag)
  1512. {
  1513. states[i]->proof = GNUNET_strndup (R_last[a->start->dfs_id * n + i].sbuf,
  1514. R_last[a->start->dfs_id * n + i].slen);
  1515. GNUNET_CRYPTO_hash (states[i]->proof,
  1516. strlen (states[i]->proof),
  1517. &states[i]->hash);
  1518. }
  1519. }
  1520. /* complete regex for whole DFA: union of all pairs (start state/accepting
  1521. * state(s)). */
  1522. sb_init (&complete_regex, 16 * n);
  1523. for (i = 0; i < n; i++)
  1524. {
  1525. if (states[i]->accepting)
  1526. {
  1527. if ((0 == complete_regex.slen) &&
  1528. (0 < R_last[a->start->dfs_id * n + i].slen))
  1529. {
  1530. sb_append (&complete_regex, &R_last[a->start->dfs_id * n + i]);
  1531. }
  1532. else if ((GNUNET_YES != R_last[a->start->dfs_id * n + i].null_flag) &&
  1533. (0 < R_last[a->start->dfs_id * n + i].slen))
  1534. {
  1535. sb_append_cstr (&complete_regex, "|");
  1536. sb_append (&complete_regex, &R_last[a->start->dfs_id * n + i]);
  1537. }
  1538. }
  1539. }
  1540. a->canonical_regex =
  1541. GNUNET_strndup (complete_regex.sbuf, complete_regex.slen);
  1542. /* cleanup */
  1543. sb_free (&complete_regex);
  1544. for (i = 0; i < n; i++)
  1545. for (j = 0; j < n; j++)
  1546. {
  1547. sb_free (&R_cur[i * n + j]);
  1548. sb_free (&R_last[i * n + j]);
  1549. }
  1550. GNUNET_free (R_cur);
  1551. GNUNET_free (R_last);
  1552. return GNUNET_OK;
  1553. }
  1554. /**
  1555. * Creates a new DFA state based on a set of NFA states. Needs to be freed using
  1556. * automaton_destroy_state.
  1557. *
  1558. * @param ctx context
  1559. * @param nfa_states set of NFA states on which the DFA should be based on
  1560. *
  1561. * @return new DFA state
  1562. */
  1563. static struct REGEX_INTERNAL_State *
  1564. dfa_state_create (struct REGEX_INTERNAL_Context *ctx,
  1565. struct REGEX_INTERNAL_StateSet *nfa_states)
  1566. {
  1567. struct REGEX_INTERNAL_State *s;
  1568. char *pos;
  1569. size_t len;
  1570. struct REGEX_INTERNAL_State *cstate;
  1571. struct REGEX_INTERNAL_Transition *ctran;
  1572. unsigned int i;
  1573. s = GNUNET_new (struct REGEX_INTERNAL_State);
  1574. s->id = ctx->state_id++;
  1575. s->index = -1;
  1576. s->lowlink = -1;
  1577. if (NULL == nfa_states)
  1578. {
  1579. GNUNET_asprintf (&s->name, "s%i", s->id);
  1580. return s;
  1581. }
  1582. s->nfa_set = *nfa_states;
  1583. if (nfa_states->off < 1)
  1584. return s;
  1585. /* Create a name based on 'nfa_states' */
  1586. len = nfa_states->off * 14 + 4;
  1587. s->name = GNUNET_malloc (len);
  1588. strcat (s->name, "{");
  1589. pos = s->name + 1;
  1590. for (i = 0; i < nfa_states->off; i++)
  1591. {
  1592. cstate = nfa_states->states[i];
  1593. GNUNET_snprintf (pos, pos - s->name + len, "%i,", cstate->id);
  1594. pos += strlen (pos);
  1595. /* Add a transition for each distinct label to NULL state */
  1596. for (ctran = cstate->transitions_head; NULL != ctran; ctran = ctran->next)
  1597. if (NULL != ctran->label)
  1598. state_add_transition (ctx, s, ctran->label, NULL);
  1599. /* If the nfa_states contain an accepting state, the new dfa state is also
  1600. * accepting. */
  1601. if (cstate->accepting)
  1602. s->accepting = 1;
  1603. }
  1604. pos[-1] = '}';
  1605. s->name = GNUNET_realloc (s->name, strlen (s->name) + 1);
  1606. memset (nfa_states, 0, sizeof(struct REGEX_INTERNAL_StateSet));
  1607. return s;
  1608. }
  1609. /**
  1610. * Move from the given state 's' to the next state on transition 'str'. Consumes
  1611. * as much of the given 'str' as possible (useful for strided DFAs). On return
  1612. * 's' will point to the next state, and the length of the substring used for
  1613. * this transition will be returned. If no transition possible 0 is returned and
  1614. * 's' points to NULL.
  1615. *
  1616. * @param s starting state, will point to the next state or NULL (if no
  1617. * transition possible)
  1618. * @param str edge label to follow (will match longest common prefix)
  1619. *
  1620. * @return length of the substring consumed from 'str'
  1621. */
  1622. static unsigned int
  1623. dfa_move (struct REGEX_INTERNAL_State **s, const char *str)
  1624. {
  1625. struct REGEX_INTERNAL_Transition *t;
  1626. struct REGEX_INTERNAL_State *new_s;
  1627. unsigned int len;
  1628. unsigned int max_len;
  1629. if (NULL == s)
  1630. return 0;
  1631. new_s = NULL;
  1632. max_len = 0;
  1633. for (t = (*s)->transitions_head; NULL != t; t = t->next)
  1634. {
  1635. len = strlen (t->label);
  1636. if (0 == strncmp (t->label, str, len))
  1637. {
  1638. if (len >= max_len)
  1639. {
  1640. max_len = len;
  1641. new_s = t->to_state;
  1642. }
  1643. }
  1644. }
  1645. *s = new_s;
  1646. return max_len;
  1647. }
  1648. /**
  1649. * Set the given state 'marked' to #GNUNET_YES. Used by the
  1650. * #dfa_remove_unreachable_states() function to detect unreachable states in the
  1651. * automaton.
  1652. *
  1653. * @param cls closure, not used.
  1654. * @param count count, not used.
  1655. * @param s state where the marked attribute will be set to #GNUNET_YES.
  1656. */
  1657. static void
  1658. mark_states (void *cls,
  1659. const unsigned int count,
  1660. struct REGEX_INTERNAL_State *s)
  1661. {
  1662. s->marked = GNUNET_YES;
  1663. }
  1664. /**
  1665. * Remove all unreachable states from DFA 'a'. Unreachable states are those
  1666. * states that are not reachable from the starting state.
  1667. *
  1668. * @param a DFA automaton
  1669. */
  1670. static void
  1671. dfa_remove_unreachable_states (struct REGEX_INTERNAL_Automaton *a)
  1672. {
  1673. struct REGEX_INTERNAL_State *s;
  1674. struct REGEX_INTERNAL_State *s_next;
  1675. /* 1. unmark all states */
  1676. for (s = a->states_head; NULL != s; s = s->next)
  1677. s->marked = GNUNET_NO;
  1678. /* 2. traverse dfa from start state and mark all visited states */
  1679. REGEX_INTERNAL_automaton_traverse (a,
  1680. a->start,
  1681. NULL,
  1682. NULL,
  1683. &mark_states,
  1684. NULL);
  1685. /* 3. delete all states that were not visited */
  1686. for (s = a->states_head; NULL != s; s = s_next)
  1687. {
  1688. s_next = s->next;
  1689. if (GNUNET_NO == s->marked)
  1690. automaton_remove_state (a, s);
  1691. }
  1692. }
  1693. /**
  1694. * Remove all dead states from the DFA 'a'. Dead states are those states that do
  1695. * not transition to any other state but themselves.
  1696. *
  1697. * @param a DFA automaton
  1698. */
  1699. static void
  1700. dfa_remove_dead_states (struct REGEX_INTERNAL_Automaton *a)
  1701. {
  1702. struct REGEX_INTERNAL_State *s;
  1703. struct REGEX_INTERNAL_State *s_next;
  1704. struct REGEX_INTERNAL_Transition *t;
  1705. int dead;
  1706. GNUNET_assert (DFA == a->type);
  1707. for (s = a->states_head; NULL != s; s = s_next)
  1708. {
  1709. s_next = s->next;
  1710. if (s->accepting)
  1711. continue;
  1712. dead = 1;
  1713. for (t = s->transitions_head; NULL != t; t = t->next)
  1714. {
  1715. if ((NULL != t->to_state) && (t->to_state != s) )
  1716. {
  1717. dead = 0;
  1718. break;
  1719. }
  1720. }
  1721. if (0 == dead)
  1722. continue;
  1723. /* state s is dead, remove it */
  1724. automaton_remove_state (a, s);
  1725. }
  1726. }
  1727. /**
  1728. * Merge all non distinguishable states in the DFA 'a'
  1729. *
  1730. * @param ctx context
  1731. * @param a DFA automaton
  1732. * @return #GNUNET_OK on success
  1733. */
  1734. static int
  1735. dfa_merge_nondistinguishable_states (struct REGEX_INTERNAL_Context *ctx,
  1736. struct REGEX_INTERNAL_Automaton *a)
  1737. {
  1738. uint32_t *table;
  1739. struct REGEX_INTERNAL_State *s1;
  1740. struct REGEX_INTERNAL_State *s2;
  1741. struct REGEX_INTERNAL_Transition *t1;
  1742. struct REGEX_INTERNAL_Transition *t2;
  1743. struct REGEX_INTERNAL_State *s1_next;
  1744. struct REGEX_INTERNAL_State *s2_next;
  1745. int change;
  1746. unsigned int num_equal_edges;
  1747. unsigned int i;
  1748. unsigned int state_cnt;
  1749. unsigned long long idx;
  1750. unsigned long long idx1;
  1751. if ((NULL == a) || (0 == a->state_count))
  1752. {
  1753. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  1754. "Could not merge nondistinguishable states, automaton was NULL.\n");
  1755. return GNUNET_SYSERR;
  1756. }
  1757. state_cnt = a->state_count;
  1758. table = GNUNET_malloc_large (
  1759. (sizeof(uint32_t) * state_cnt * state_cnt / 32) + sizeof(uint32_t));
  1760. if (NULL == table)
  1761. {
  1762. GNUNET_log_strerror (GNUNET_ERROR_TYPE_ERROR, "malloc");
  1763. return GNUNET_SYSERR;
  1764. }
  1765. for (i = 0, s1 = a->states_head; NULL != s1; s1 = s1->next)
  1766. s1->marked = i++;
  1767. /* Mark all pairs of accepting/!accepting states */
  1768. for (s1 = a->states_head; NULL != s1; s1 = s1->next)
  1769. for (s2 = a->states_head; NULL != s2; s2 = s2->next)
  1770. if ((s1->accepting && ! s2->accepting) ||
  1771. (! s1->accepting && s2->accepting))
  1772. {
  1773. idx = (unsigned long long) s1->marked * state_cnt + s2->marked;
  1774. table[idx / 32] |= (1U << (idx % 32));
  1775. }
  1776. /* Find all equal states */
  1777. change = 1;
  1778. while (0 != change)
  1779. {
  1780. change = 0;
  1781. for (s1 = a->states_head; NULL != s1; s1 = s1->next)
  1782. {
  1783. for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2->next)
  1784. {
  1785. idx = (unsigned long long) s1->marked * state_cnt + s2->marked;
  1786. if (0 != (table[idx / 32] & (1U << (idx % 32))))
  1787. continue;
  1788. num_equal_edges = 0;
  1789. for (t1 = s1->transitions_head; NULL != t1; t1 = t1->next)
  1790. {
  1791. for (t2 = s2->transitions_head; NULL != t2; t2 = t2->next)
  1792. {
  1793. if (0 == strcmp (t1->label, t2->label))
  1794. {
  1795. num_equal_edges++;
  1796. /* same edge, but targets definitively different, so we're different
  1797. as well */
  1798. if (t1->to_state->marked > t2->to_state->marked)
  1799. idx1 = (unsigned long long) t1->to_state->marked * state_cnt
  1800. + t2->to_state->marked;
  1801. else
  1802. idx1 = (unsigned long long) t2->to_state->marked * state_cnt
  1803. + t1->to_state->marked;
  1804. if (0 != (table[idx1 / 32] & (1U << (idx1 % 32))))
  1805. {
  1806. table[idx / 32] |= (1U << (idx % 32));
  1807. change = 1; /* changed a marker, need to run again */
  1808. }
  1809. }
  1810. }
  1811. }
  1812. if ((num_equal_edges != s1->transition_count) ||
  1813. (num_equal_edges != s2->transition_count))
  1814. {
  1815. /* Make sure ALL edges of possible equal states are the same */
  1816. table[idx / 32] |= (1U << (idx % 32));
  1817. change = 1; /* changed a marker, need to run again */
  1818. }
  1819. }
  1820. }
  1821. }
  1822. /* Merge states that are equal */
  1823. for (s1 = a->states_head; NULL != s1; s1 = s1_next)
  1824. {
  1825. s1_next = s1->next;
  1826. for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2_next)
  1827. {
  1828. s2_next = s2->next;
  1829. idx = (unsigned long long) s1->marked * state_cnt + s2->marked;
  1830. if (0 == (table[idx / 32] & (1U << (idx % 32))))
  1831. automaton_merge_states (ctx, a, s1, s2);
  1832. }
  1833. }
  1834. GNUNET_free (table);
  1835. return GNUNET_OK;
  1836. }
  1837. /**
  1838. * Minimize the given DFA 'a' by removing all unreachable states, removing all
  1839. * dead states and merging all non distinguishable states
  1840. *
  1841. * @param ctx context
  1842. * @param a DFA automaton
  1843. * @return GNUNET_OK on success
  1844. */
  1845. static int
  1846. dfa_minimize (struct REGEX_INTERNAL_Context *ctx,
  1847. struct REGEX_INTERNAL_Automaton *a)
  1848. {
  1849. if (NULL == a)
  1850. return GNUNET_SYSERR;
  1851. GNUNET_assert (DFA == a->type);
  1852. /* 1. remove unreachable states */
  1853. dfa_remove_unreachable_states (a);
  1854. /* 2. remove dead states */
  1855. dfa_remove_dead_states (a);
  1856. /* 3. Merge nondistinguishable states */
  1857. if (GNUNET_OK != dfa_merge_nondistinguishable_states (ctx, a))
  1858. return GNUNET_SYSERR;
  1859. return GNUNET_OK;
  1860. }
  1861. /**
  1862. * Context for adding strided transitions to a DFA.
  1863. */
  1864. struct REGEX_INTERNAL_Strided_Context
  1865. {
  1866. /**
  1867. * Length of the strides.
  1868. */
  1869. const unsigned int stride;
  1870. /**
  1871. * Strided transitions DLL. New strided transitions will be stored in this DLL
  1872. * and afterwards added to the DFA.
  1873. */
  1874. struct REGEX_INTERNAL_Transition *transitions_head;
  1875. /**
  1876. * Strided transitions DLL.
  1877. */
  1878. struct REGEX_INTERNAL_Transition *transitions_tail;
  1879. };
  1880. /**
  1881. * Recursive helper function to add strides to a DFA.
  1882. *
  1883. * @param cls context, contains stride length and strided transitions DLL.
  1884. * @param depth current depth of the depth-first traversal of the graph.
  1885. * @param label current label, string that contains all labels on the path from
  1886. * 'start' to 's'.
  1887. * @param start start state for the depth-first traversal of the graph.
  1888. * @param s current state in the depth-first traversal
  1889. */
  1890. static void
  1891. dfa_add_multi_strides_helper (void *cls,
  1892. const unsigned int depth,
  1893. char *label,
  1894. struct REGEX_INTERNAL_State *start,
  1895. struct REGEX_INTERNAL_State *s)
  1896. {
  1897. struct REGEX_INTERNAL_Strided_Context *ctx = cls;
  1898. struct REGEX_INTERNAL_Transition *t;
  1899. char *new_label;
  1900. if (depth == ctx->stride)
  1901. {
  1902. t = GNUNET_new (struct REGEX_INTERNAL_Transition);
  1903. t->label = GNUNET_strdup (label);
  1904. t->to_state = s;
  1905. t->from_state = start;
  1906. GNUNET_CONTAINER_DLL_insert (ctx->transitions_head,
  1907. ctx->transitions_tail,
  1908. t);
  1909. }
  1910. else
  1911. {
  1912. for (t = s->transitions_head; NULL != t; t = t->next)
  1913. {
  1914. /* Do not consider self-loops, because it end's up in too many
  1915. * transitions */
  1916. if (t->to_state == t->from_state)
  1917. continue;
  1918. if (NULL != label)
  1919. {
  1920. GNUNET_asprintf (&new_label, "%s%s", label, t->label);
  1921. }
  1922. else
  1923. new_label = GNUNET_strdup (t->label);
  1924. dfa_add_multi_strides_helper (cls,
  1925. (depth + 1),
  1926. new_label,
  1927. start,
  1928. t->to_state);
  1929. }
  1930. }
  1931. GNUNET_free (label);
  1932. }
  1933. /**
  1934. * Function called for each state in the DFA. Starts a traversal of depth set in
  1935. * context starting from state 's'.
  1936. *
  1937. * @param cls context.
  1938. * @param count not used.
  1939. * @param s current state.
  1940. */
  1941. static void
  1942. dfa_add_multi_strides (void *cls,
  1943. const unsigned int count,
  1944. struct REGEX_INTERNAL_State *s)
  1945. {
  1946. dfa_add_multi_strides_helper (cls, 0, NULL, s, s);
  1947. }
  1948. /**
  1949. * Adds multi-strided transitions to the given 'dfa'.
  1950. *
  1951. * @param regex_ctx regex context needed to add transitions to the automaton.
  1952. * @param dfa DFA to which the multi strided transitions should be added.
  1953. * @param stride_len length of the strides.
  1954. */
  1955. void
  1956. REGEX_INTERNAL_dfa_add_multi_strides (struct REGEX_INTERNAL_Context *regex_ctx,
  1957. struct REGEX_INTERNAL_Automaton *dfa,
  1958. const unsigned int stride_len)
  1959. {
  1960. struct REGEX_INTERNAL_Strided_Context ctx = { stride_len, NULL, NULL };
  1961. struct REGEX_INTERNAL_Transition *t;
  1962. struct REGEX_INTERNAL_Transition *t_next;
  1963. if ((1 > stride_len) || (GNUNET_YES == dfa->is_multistrided))
  1964. return;
  1965. /* Compute the new transitions of given stride_len */
  1966. REGEX_INTERNAL_automaton_traverse (dfa,
  1967. dfa->start,
  1968. NULL,
  1969. NULL,
  1970. &dfa_add_multi_strides,
  1971. &ctx);
  1972. /* Add all the new transitions to the automaton. */
  1973. for (t = ctx.transitions_head; NULL != t; t = t_next)
  1974. {
  1975. t_next = t->next;
  1976. state_add_transition (regex_ctx, t->from_state, t->label, t->to_state);
  1977. GNUNET_CONTAINER_DLL_remove (ctx.transitions_head, ctx.transitions_tail, t);
  1978. GNUNET_free (t->label);
  1979. GNUNET_free (t);
  1980. }
  1981. /* Mark this automaton as multistrided */
  1982. dfa->is_multistrided = GNUNET_YES;
  1983. }
  1984. /**
  1985. * Recursive Helper function for DFA path compression. Does DFS on the DFA graph
  1986. * and adds new transitions to the given transitions DLL and marks states that
  1987. * should be removed by setting state->contained to GNUNET_YES.
  1988. *
  1989. * @param dfa DFA for which the paths should be compressed.
  1990. * @param start starting state for linear path search.
  1991. * @param cur current state in the recursive DFS.
  1992. * @param label current label (string of traversed labels).
  1993. * @param max_len maximal path compression length.
  1994. * @param transitions_head transitions DLL.
  1995. * @param transitions_tail transitions DLL.
  1996. */
  1997. void
  1998. dfa_compress_paths_helper (struct REGEX_INTERNAL_Automaton *dfa,
  1999. struct REGEX_INTERNAL_State *start,
  2000. struct REGEX_INTERNAL_State *cur,
  2001. char *label,
  2002. unsigned int max_len,
  2003. struct REGEX_INTERNAL_Transition **transitions_head,
  2004. struct REGEX_INTERNAL_Transition **transitions_tail)
  2005. {
  2006. struct REGEX_INTERNAL_Transition *t;
  2007. char *new_label;
  2008. if ((NULL != label) &&
  2009. (((cur->incoming_transition_count > 1) || (GNUNET_YES ==
  2010. cur->accepting) ||
  2011. (GNUNET_YES == cur->marked) ) ||
  2012. ((start != dfa->start) && (max_len > 0) && (max_len == strlen (
  2013. label))) ||
  2014. ((start == dfa->start) && (GNUNET_REGEX_INITIAL_BYTES == strlen (
  2015. label)))))
  2016. {
  2017. t = GNUNET_new (struct REGEX_INTERNAL_Transition);
  2018. t->label = GNUNET_strdup (label);
  2019. t->to_state = cur;
  2020. t->from_state = start;
  2021. GNUNET_CONTAINER_DLL_insert (*transitions_head, *transitions_tail, t);
  2022. if (GNUNET_NO == cur->marked)
  2023. {
  2024. dfa_compress_paths_helper (dfa,
  2025. cur,
  2026. cur,
  2027. NULL,
  2028. max_len,
  2029. transitions_head,
  2030. transitions_tail);
  2031. }
  2032. return;
  2033. }
  2034. else if (cur != start)
  2035. cur->contained = GNUNET_YES;
  2036. if ((GNUNET_YES == cur->marked) && (cur != start))
  2037. return;
  2038. cur->marked = GNUNET_YES;
  2039. for (t = cur->transitions_head; NULL != t; t = t->next)
  2040. {
  2041. if (NULL != label)
  2042. GNUNET_asprintf (&new_label, "%s%s", label, t->label);
  2043. else
  2044. new_label = GNUNET_strdup (t->label);
  2045. if (t->to_state != cur)
  2046. {
  2047. dfa_compress_paths_helper (dfa,
  2048. start,
  2049. t->to_state,
  2050. new_label,
  2051. max_len,
  2052. transitions_head,
  2053. transitions_tail);
  2054. }
  2055. GNUNET_free (new_label);
  2056. }
  2057. }
  2058. /**
  2059. * Compress paths in the given 'dfa'. Linear paths like 0->1->2->3 will be
  2060. * compressed to 0->3 by combining transitions.
  2061. *
  2062. * @param regex_ctx context for adding new transitions.
  2063. * @param dfa DFA representation, will directly modify the given DFA.
  2064. * @param max_len maximal length of the compressed paths.
  2065. */
  2066. static void
  2067. dfa_compress_paths (struct REGEX_INTERNAL_Context *regex_ctx,
  2068. struct REGEX_INTERNAL_Automaton *dfa,
  2069. unsigned int max_len)
  2070. {
  2071. struct REGEX_INTERNAL_State *s;
  2072. struct REGEX_INTERNAL_State *s_next;
  2073. struct REGEX_INTERNAL_Transition *t;
  2074. struct REGEX_INTERNAL_Transition *t_next;
  2075. struct REGEX_INTERNAL_Transition *transitions_head = NULL;
  2076. struct REGEX_INTERNAL_Transition *transitions_tail = NULL;
  2077. if (NULL == dfa)
  2078. return;
  2079. /* Count the incoming transitions on each state. */
  2080. for (s = dfa->states_head; NULL != s; s = s->next)
  2081. {
  2082. for (t = s->transitions_head; NULL != t; t = t->next)
  2083. {
  2084. if (NULL != t->to_state)
  2085. t->to_state->incoming_transition_count++;
  2086. }
  2087. }
  2088. /* Unmark all states. */
  2089. for (s = dfa->states_head; NULL != s; s = s->next)
  2090. {
  2091. s->marked = GNUNET_NO;
  2092. s->contained = GNUNET_NO;
  2093. }
  2094. /* Add strides and mark states that can be deleted. */
  2095. dfa_compress_paths_helper (dfa,
  2096. dfa->start,
  2097. dfa->start,
  2098. NULL,
  2099. max_len,
  2100. &transitions_head,
  2101. &transitions_tail);
  2102. /* Add all the new transitions to the automaton. */
  2103. for (t = transitions_head; NULL != t; t = t_next)
  2104. {
  2105. t_next = t->next;
  2106. state_add_transition (regex_ctx, t->from_state, t->label, t->to_state);
  2107. GNUNET_CONTAINER_DLL_remove (transitions_head, transitions_tail, t);
  2108. GNUNET_free (t->label);
  2109. GNUNET_free (t);
  2110. }
  2111. /* Remove marked states (including their incoming and outgoing transitions). */
  2112. for (s = dfa->states_head; NULL != s; s = s_next)
  2113. {
  2114. s_next = s->next;
  2115. if (GNUNET_YES == s->contained)
  2116. automaton_remove_state (dfa, s);
  2117. }
  2118. }
  2119. /**
  2120. * Creates a new NFA fragment. Needs to be cleared using
  2121. * automaton_fragment_clear.
  2122. *
  2123. * @param start starting state
  2124. * @param end end state
  2125. *
  2126. * @return new NFA fragment
  2127. */
  2128. static struct REGEX_INTERNAL_Automaton *
  2129. nfa_fragment_create (struct REGEX_INTERNAL_State *start,
  2130. struct REGEX_INTERNAL_State *end)
  2131. {
  2132. struct REGEX_INTERNAL_Automaton *n;
  2133. n = GNUNET_new (struct REGEX_INTERNAL_Automaton);
  2134. n->type = NFA;
  2135. n->start = NULL;
  2136. n->end = NULL;
  2137. n->state_count = 0;
  2138. if ((NULL == start) || (NULL == end))
  2139. return n;
  2140. automaton_add_state (n, end);
  2141. automaton_add_state (n, start);
  2142. n->state_count = 2;
  2143. n->start = start;
  2144. n->end = end;
  2145. return n;
  2146. }
  2147. /**
  2148. * Adds a list of states to the given automaton 'n'.
  2149. *
  2150. * @param n automaton to which the states should be added
  2151. * @param states_head head of the DLL of states
  2152. * @param states_tail tail of the DLL of states
  2153. */
  2154. static void
  2155. nfa_add_states (struct REGEX_INTERNAL_Automaton *n,
  2156. struct REGEX_INTERNAL_State *states_head,
  2157. struct REGEX_INTERNAL_State *states_tail)
  2158. {
  2159. struct REGEX_INTERNAL_State *s;
  2160. if ((NULL == n) || (NULL == states_head))
  2161. {
  2162. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not add states\n");
  2163. return;
  2164. }
  2165. if (NULL == n->states_head)
  2166. {
  2167. n->states_head = states_head;
  2168. n->states_tail = states_tail;
  2169. return;
  2170. }
  2171. if (NULL != states_head)
  2172. {
  2173. n->states_tail->next = states_head;
  2174. n->states_tail = states_tail;
  2175. }
  2176. for (s = states_head; NULL != s; s = s->next)
  2177. n->state_count++;
  2178. }
  2179. /**
  2180. * Creates a new NFA state. Needs to be freed using automaton_destroy_state.
  2181. *
  2182. * @param ctx context
  2183. * @param accepting is it an accepting state or not
  2184. *
  2185. * @return new NFA state
  2186. */
  2187. static struct REGEX_INTERNAL_State *
  2188. nfa_state_create (struct REGEX_INTERNAL_Context *ctx, int accepting)
  2189. {
  2190. struct REGEX_INTERNAL_State *s;
  2191. s = GNUNET_new (struct REGEX_INTERNAL_State);
  2192. s->id = ctx->state_id++;
  2193. s->accepting = accepting;
  2194. s->marked = GNUNET_NO;
  2195. s->contained = 0;
  2196. s->index = -1;
  2197. s->lowlink = -1;
  2198. s->scc_id = 0;
  2199. s->name = NULL;
  2200. GNUNET_asprintf (&s->name, "s%i", s->id);
  2201. return s;
  2202. }
  2203. /**
  2204. * Calculates the closure set for the given set of states.
  2205. *
  2206. * @param ret set to sorted nfa closure on 'label' (epsilon closure if 'label' is NULL)
  2207. * @param nfa the NFA containing 's'
  2208. * @param states list of states on which to base the closure on
  2209. * @param label transitioning label for which to base the closure on,
  2210. * pass NULL for epsilon transition
  2211. */
  2212. static void
  2213. nfa_closure_set_create (struct REGEX_INTERNAL_StateSet *ret,
  2214. struct REGEX_INTERNAL_Automaton *nfa,
  2215. struct REGEX_INTERNAL_StateSet *states,
  2216. const char *label)
  2217. {
  2218. struct REGEX_INTERNAL_State *s;
  2219. unsigned int i;
  2220. struct REGEX_INTERNAL_StateSet_MDLL cls_stack;
  2221. struct REGEX_INTERNAL_State *clsstate;
  2222. struct REGEX_INTERNAL_State *currentstate;
  2223. struct REGEX_INTERNAL_Transition *ctran;
  2224. memset (ret, 0, sizeof(struct REGEX_INTERNAL_StateSet));
  2225. if (NULL == states)
  2226. return;
  2227. for (i = 0; i < states->off; i++)
  2228. {
  2229. s = states->states[i];
  2230. /* Add start state to closure only for epsilon closure */
  2231. if (NULL == label)
  2232. state_set_append (ret, s);
  2233. /* initialize work stack */
  2234. cls_stack.head = NULL;
  2235. cls_stack.tail = NULL;
  2236. GNUNET_CONTAINER_MDLL_insert (ST, cls_stack.head, cls_stack.tail, s);
  2237. cls_stack.len = 1;
  2238. while (NULL != (currentstate = cls_stack.tail))
  2239. {
  2240. GNUNET_CONTAINER_MDLL_remove (ST,
  2241. cls_stack.head,
  2242. cls_stack.tail,
  2243. currentstate);
  2244. cls_stack.len--;
  2245. for (ctran = currentstate->transitions_head; NULL != ctran;
  2246. ctran = ctran->next)
  2247. {
  2248. if (NULL == (clsstate = ctran->to_state))
  2249. continue;
  2250. if (0 != clsstate->contained)
  2251. continue;
  2252. if (0 != nullstrcmp (label, ctran->label))
  2253. continue;
  2254. state_set_append (ret, clsstate);
  2255. GNUNET_CONTAINER_MDLL_insert_tail (ST,
  2256. cls_stack.head,
  2257. cls_stack.tail,
  2258. clsstate);
  2259. cls_stack.len++;
  2260. clsstate->contained = 1;
  2261. }
  2262. }
  2263. }
  2264. for (i = 0; i < ret->off; i++)
  2265. ret->states[i]->contained = 0;
  2266. if (ret->off > 1)
  2267. qsort (ret->states,
  2268. ret->off,
  2269. sizeof(struct REGEX_INTERNAL_State *),
  2270. &state_compare);
  2271. }
  2272. /**
  2273. * Pops two NFA fragments (a, b) from the stack and concatenates them (ab)
  2274. *
  2275. * @param ctx context
  2276. */
  2277. static void
  2278. nfa_add_concatenation (struct REGEX_INTERNAL_Context *ctx)
  2279. {
  2280. struct REGEX_INTERNAL_Automaton *a;
  2281. struct REGEX_INTERNAL_Automaton *b;
  2282. struct REGEX_INTERNAL_Automaton *new_nfa;
  2283. b = ctx->stack_tail;
  2284. GNUNET_assert (NULL != b);
  2285. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, b);
  2286. a = ctx->stack_tail;
  2287. GNUNET_assert (NULL != a);
  2288. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2289. state_add_transition (ctx, a->end, NULL, b->start);
  2290. a->end->accepting = 0;
  2291. b->end->accepting = 1;
  2292. new_nfa = nfa_fragment_create (NULL, NULL);
  2293. nfa_add_states (new_nfa, a->states_head, a->states_tail);
  2294. nfa_add_states (new_nfa, b->states_head, b->states_tail);
  2295. new_nfa->start = a->start;
  2296. new_nfa->end = b->end;
  2297. new_nfa->state_count += a->state_count + b->state_count;
  2298. automaton_fragment_clear (a);
  2299. automaton_fragment_clear (b);
  2300. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
  2301. }
  2302. /**
  2303. * Pops a NFA fragment from the stack (a) and adds a new fragment (a*)
  2304. *
  2305. * @param ctx context
  2306. */
  2307. static void
  2308. nfa_add_star_op (struct REGEX_INTERNAL_Context *ctx)
  2309. {
  2310. struct REGEX_INTERNAL_Automaton *a;
  2311. struct REGEX_INTERNAL_Automaton *new_nfa;
  2312. struct REGEX_INTERNAL_State *start;
  2313. struct REGEX_INTERNAL_State *end;
  2314. a = ctx->stack_tail;
  2315. if (NULL == a)
  2316. {
  2317. GNUNET_log (
  2318. GNUNET_ERROR_TYPE_ERROR,
  2319. "nfa_add_star_op failed, because there was no element on the stack");
  2320. return;
  2321. }
  2322. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2323. start = nfa_state_create (ctx, 0);
  2324. end = nfa_state_create (ctx, 1);
  2325. state_add_transition (ctx, start, NULL, a->start);
  2326. state_add_transition (ctx, start, NULL, end);
  2327. state_add_transition (ctx, a->end, NULL, a->start);
  2328. state_add_transition (ctx, a->end, NULL, end);
  2329. a->end->accepting = 0;
  2330. end->accepting = 1;
  2331. new_nfa = nfa_fragment_create (start, end);
  2332. nfa_add_states (new_nfa, a->states_head, a->states_tail);
  2333. automaton_fragment_clear (a);
  2334. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
  2335. }
  2336. /**
  2337. * Pops an NFA fragment (a) from the stack and adds a new fragment (a+)
  2338. *
  2339. * @param ctx context
  2340. */
  2341. static void
  2342. nfa_add_plus_op (struct REGEX_INTERNAL_Context *ctx)
  2343. {
  2344. struct REGEX_INTERNAL_Automaton *a;
  2345. a = ctx->stack_tail;
  2346. if (NULL == a)
  2347. {
  2348. GNUNET_log (
  2349. GNUNET_ERROR_TYPE_ERROR,
  2350. "nfa_add_plus_op failed, because there was no element on the stack");
  2351. return;
  2352. }
  2353. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2354. state_add_transition (ctx, a->end, NULL, a->start);
  2355. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, a);
  2356. }
  2357. /**
  2358. * Pops an NFA fragment (a) from the stack and adds a new fragment (a?)
  2359. *
  2360. * @param ctx context
  2361. */
  2362. static void
  2363. nfa_add_question_op (struct REGEX_INTERNAL_Context *ctx)
  2364. {
  2365. struct REGEX_INTERNAL_Automaton *a;
  2366. struct REGEX_INTERNAL_Automaton *new_nfa;
  2367. struct REGEX_INTERNAL_State *start;
  2368. struct REGEX_INTERNAL_State *end;
  2369. a = ctx->stack_tail;
  2370. if (NULL == a)
  2371. {
  2372. GNUNET_log (
  2373. GNUNET_ERROR_TYPE_ERROR,
  2374. "nfa_add_question_op failed, because there was no element on the stack");
  2375. return;
  2376. }
  2377. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2378. start = nfa_state_create (ctx, 0);
  2379. end = nfa_state_create (ctx, 1);
  2380. state_add_transition (ctx, start, NULL, a->start);
  2381. state_add_transition (ctx, start, NULL, end);
  2382. state_add_transition (ctx, a->end, NULL, end);
  2383. a->end->accepting = 0;
  2384. new_nfa = nfa_fragment_create (start, end);
  2385. nfa_add_states (new_nfa, a->states_head, a->states_tail);
  2386. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
  2387. automaton_fragment_clear (a);
  2388. }
  2389. /**
  2390. * Pops two NFA fragments (a, b) from the stack and adds a new NFA fragment that
  2391. * alternates between a and b (a|b)
  2392. *
  2393. * @param ctx context
  2394. */
  2395. static void
  2396. nfa_add_alternation (struct REGEX_INTERNAL_Context *ctx)
  2397. {
  2398. struct REGEX_INTERNAL_Automaton *a;
  2399. struct REGEX_INTERNAL_Automaton *b;
  2400. struct REGEX_INTERNAL_Automaton *new_nfa;
  2401. struct REGEX_INTERNAL_State *start;
  2402. struct REGEX_INTERNAL_State *end;
  2403. b = ctx->stack_tail;
  2404. GNUNET_assert (NULL != b);
  2405. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, b);
  2406. a = ctx->stack_tail;
  2407. GNUNET_assert (NULL != a);
  2408. GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a);
  2409. start = nfa_state_create (ctx, 0);
  2410. end = nfa_state_create (ctx, 1);
  2411. state_add_transition (ctx, start, NULL, a->start);
  2412. state_add_transition (ctx, start, NULL, b->start);
  2413. state_add_transition (ctx, a->end, NULL, end);
  2414. state_add_transition (ctx, b->end, NULL, end);
  2415. a->end->accepting = 0;
  2416. b->end->accepting = 0;
  2417. end->accepting = 1;
  2418. new_nfa = nfa_fragment_create (start, end);
  2419. nfa_add_states (new_nfa, a->states_head, a->states_tail);
  2420. nfa_add_states (new_nfa, b->states_head, b->states_tail);
  2421. automaton_fragment_clear (a);
  2422. automaton_fragment_clear (b);
  2423. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa);
  2424. }
  2425. /**
  2426. * Adds a new nfa fragment to the stack
  2427. *
  2428. * @param ctx context
  2429. * @param label label for nfa transition
  2430. */
  2431. static void
  2432. nfa_add_label (struct REGEX_INTERNAL_Context *ctx, const char *label)
  2433. {
  2434. struct REGEX_INTERNAL_Automaton *n;
  2435. struct REGEX_INTERNAL_State *start;
  2436. struct REGEX_INTERNAL_State *end;
  2437. GNUNET_assert (NULL != ctx);
  2438. start = nfa_state_create (ctx, 0);
  2439. end = nfa_state_create (ctx, 1);
  2440. state_add_transition (ctx, start, label, end);
  2441. n = nfa_fragment_create (start, end);
  2442. GNUNET_assert (NULL != n);
  2443. GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, n);
  2444. }
  2445. /**
  2446. * Initialize a new context
  2447. *
  2448. * @param ctx context
  2449. */
  2450. static void
  2451. REGEX_INTERNAL_context_init (struct REGEX_INTERNAL_Context *ctx)
  2452. {
  2453. if (NULL == ctx)
  2454. {
  2455. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Context was NULL!");
  2456. return;
  2457. }
  2458. ctx->state_id = 0;
  2459. ctx->transition_id = 0;
  2460. ctx->stack_head = NULL;
  2461. ctx->stack_tail = NULL;
  2462. }
  2463. /**
  2464. * Construct an NFA by parsing the regex string of length 'len'.
  2465. *
  2466. * @param regex regular expression string
  2467. * @param len length of the string
  2468. *
  2469. * @return NFA, needs to be freed using REGEX_INTERNAL_destroy_automaton
  2470. */
  2471. struct REGEX_INTERNAL_Automaton *
  2472. REGEX_INTERNAL_construct_nfa (const char *regex, const size_t len)
  2473. {
  2474. struct REGEX_INTERNAL_Context ctx;
  2475. struct REGEX_INTERNAL_Automaton *nfa;
  2476. const char *regexp;
  2477. char curlabel[2];
  2478. char *error_msg;
  2479. unsigned int count;
  2480. unsigned int altcount;
  2481. unsigned int atomcount;
  2482. unsigned int poff;
  2483. unsigned int psize;
  2484. struct
  2485. {
  2486. int altcount;
  2487. int atomcount;
  2488. } *p;
  2489. if ((NULL == regex) || (0 == strlen (regex)) || (0 == len))
  2490. {
  2491. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2492. "Could not parse regex. Empty regex string provided.\n");
  2493. return NULL;
  2494. }
  2495. REGEX_INTERNAL_context_init (&ctx);
  2496. regexp = regex;
  2497. curlabel[1] = '\0';
  2498. p = NULL;
  2499. error_msg = NULL;
  2500. altcount = 0;
  2501. atomcount = 0;
  2502. poff = 0;
  2503. psize = 0;
  2504. for (count = 0; count < len && *regexp; count++, regexp++)
  2505. {
  2506. switch (*regexp)
  2507. {
  2508. case '(':
  2509. if (atomcount > 1)
  2510. {
  2511. --atomcount;
  2512. nfa_add_concatenation (&ctx);
  2513. }
  2514. if (poff == psize)
  2515. GNUNET_array_grow (p, psize, psize * 2 + 4); /* FIXME why *2 +4? */
  2516. p[poff].altcount = altcount;
  2517. p[poff].atomcount = atomcount;
  2518. poff++;
  2519. altcount = 0;
  2520. atomcount = 0;
  2521. break;
  2522. case '|':
  2523. if (0 == atomcount)
  2524. {
  2525. error_msg = "Cannot append '|' to nothing";
  2526. goto error;
  2527. }
  2528. while (--atomcount > 0)
  2529. nfa_add_concatenation (&ctx);
  2530. altcount++;
  2531. break;
  2532. case ')':
  2533. if (0 == poff)
  2534. {
  2535. error_msg = "Missing opening '('";
  2536. goto error;
  2537. }
  2538. if (0 == atomcount)
  2539. {
  2540. /* Ignore this: "()" */
  2541. poff--;
  2542. altcount = p[poff].altcount;
  2543. atomcount = p[poff].atomcount;
  2544. break;
  2545. }
  2546. while (--atomcount > 0)
  2547. nfa_add_concatenation (&ctx);
  2548. for (; altcount > 0; altcount--)
  2549. nfa_add_alternation (&ctx);
  2550. poff--;
  2551. altcount = p[poff].altcount;
  2552. atomcount = p[poff].atomcount;
  2553. atomcount++;
  2554. break;
  2555. case '*':
  2556. if (atomcount == 0)
  2557. {
  2558. error_msg = "Cannot append '*' to nothing";
  2559. goto error;
  2560. }
  2561. nfa_add_star_op (&ctx);
  2562. break;
  2563. case '+':
  2564. if (atomcount == 0)
  2565. {
  2566. error_msg = "Cannot append '+' to nothing";
  2567. goto error;
  2568. }
  2569. nfa_add_plus_op (&ctx);
  2570. break;
  2571. case '?':
  2572. if (atomcount == 0)
  2573. {
  2574. error_msg = "Cannot append '?' to nothing";
  2575. goto error;
  2576. }
  2577. nfa_add_question_op (&ctx);
  2578. break;
  2579. default:
  2580. if (atomcount > 1)
  2581. {
  2582. --atomcount;
  2583. nfa_add_concatenation (&ctx);
  2584. }
  2585. curlabel[0] = *regexp;
  2586. nfa_add_label (&ctx, curlabel);
  2587. atomcount++;
  2588. break;
  2589. }
  2590. }
  2591. if (0 != poff)
  2592. {
  2593. error_msg = "Unbalanced parenthesis";
  2594. goto error;
  2595. }
  2596. while (--atomcount > 0)
  2597. nfa_add_concatenation (&ctx);
  2598. for (; altcount > 0; altcount--)
  2599. nfa_add_alternation (&ctx);
  2600. GNUNET_array_grow (p, psize, 0);
  2601. nfa = ctx.stack_tail;
  2602. GNUNET_CONTAINER_DLL_remove (ctx.stack_head, ctx.stack_tail, nfa);
  2603. if (NULL != ctx.stack_head)
  2604. {
  2605. error_msg = "Creating the NFA failed. NFA stack was not empty!";
  2606. goto error;
  2607. }
  2608. /* Remember the regex that was used to generate this NFA */
  2609. nfa->regex = GNUNET_strdup (regex);
  2610. /* create depth-first numbering of the states for pretty printing */
  2611. REGEX_INTERNAL_automaton_traverse (nfa,
  2612. NULL,
  2613. NULL,
  2614. NULL,
  2615. &number_states,
  2616. NULL);
  2617. /* No multistriding added so far */
  2618. nfa->is_multistrided = GNUNET_NO;
  2619. return nfa;
  2620. error:
  2621. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not parse regex: `%s'\n", regex);
  2622. if (NULL != error_msg)
  2623. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "%s\n", error_msg);
  2624. GNUNET_free (p);
  2625. while (NULL != (nfa = ctx.stack_head))
  2626. {
  2627. GNUNET_CONTAINER_DLL_remove (ctx.stack_head, ctx.stack_tail, nfa);
  2628. REGEX_INTERNAL_automaton_destroy (nfa);
  2629. }
  2630. return NULL;
  2631. }
  2632. /**
  2633. * Create DFA states based on given 'nfa' and starting with 'dfa_state'.
  2634. *
  2635. * @param ctx context.
  2636. * @param nfa NFA automaton.
  2637. * @param dfa DFA automaton.
  2638. * @param dfa_state current dfa state, pass epsilon closure of first nfa state
  2639. * for starting.
  2640. */
  2641. static void
  2642. construct_dfa_states (struct REGEX_INTERNAL_Context *ctx,
  2643. struct REGEX_INTERNAL_Automaton *nfa,
  2644. struct REGEX_INTERNAL_Automaton *dfa,
  2645. struct REGEX_INTERNAL_State *dfa_state)
  2646. {
  2647. struct REGEX_INTERNAL_Transition *ctran;
  2648. struct REGEX_INTERNAL_State *new_dfa_state;
  2649. struct REGEX_INTERNAL_State *state_contains;
  2650. struct REGEX_INTERNAL_State *state_iter;
  2651. struct REGEX_INTERNAL_StateSet tmp;
  2652. struct REGEX_INTERNAL_StateSet nfa_set;
  2653. for (ctran = dfa_state->transitions_head; NULL != ctran; ctran = ctran->next)
  2654. {
  2655. if ((NULL == ctran->label) || (NULL != ctran->to_state) )
  2656. continue;
  2657. nfa_closure_set_create (&tmp, nfa, &dfa_state->nfa_set, ctran->label);
  2658. nfa_closure_set_create (&nfa_set, nfa, &tmp, NULL);
  2659. state_set_clear (&tmp);
  2660. state_contains = NULL;
  2661. for (state_iter = dfa->states_head; NULL != state_iter;
  2662. state_iter = state_iter->next)
  2663. {
  2664. if (0 == state_set_compare (&state_iter->nfa_set, &nfa_set))
  2665. {
  2666. state_contains = state_iter;
  2667. break;
  2668. }
  2669. }
  2670. if (NULL == state_contains)
  2671. {
  2672. new_dfa_state = dfa_state_create (ctx, &nfa_set);
  2673. automaton_add_state (dfa, new_dfa_state);
  2674. ctran->to_state = new_dfa_state;
  2675. construct_dfa_states (ctx, nfa, dfa, new_dfa_state);
  2676. }
  2677. else
  2678. {
  2679. ctran->to_state = state_contains;
  2680. state_set_clear (&nfa_set);
  2681. }
  2682. }
  2683. }
  2684. /**
  2685. * Construct DFA for the given 'regex' of length 'len'.
  2686. *
  2687. * Path compression means, that for example a DFA o -> a -> b -> c -> o will be
  2688. * compressed to o -> abc -> o. Note that this parameter influences the
  2689. * non-determinism of states of the resulting NFA in the DHT (number of outgoing
  2690. * edges with the same label). For example for an application that stores IPv4
  2691. * addresses as bitstrings it could make sense to limit the path compression to
  2692. * 4 or 8.
  2693. *
  2694. * @param regex regular expression string.
  2695. * @param len length of the regular expression.
  2696. * @param max_path_len limit the path compression length to the
  2697. * given value. If set to 1, no path compression is applied. Set to 0 for
  2698. * maximal possible path compression (generally not desirable).
  2699. * @return DFA, needs to be freed using REGEX_INTERNAL_automaton_destroy.
  2700. */
  2701. struct REGEX_INTERNAL_Automaton *
  2702. REGEX_INTERNAL_construct_dfa (const char *regex,
  2703. const size_t len,
  2704. unsigned int max_path_len)
  2705. {
  2706. struct REGEX_INTERNAL_Context ctx;
  2707. struct REGEX_INTERNAL_Automaton *dfa;
  2708. struct REGEX_INTERNAL_Automaton *nfa;
  2709. struct REGEX_INTERNAL_StateSet nfa_start_eps_cls;
  2710. struct REGEX_INTERNAL_StateSet singleton_set;
  2711. REGEX_INTERNAL_context_init (&ctx);
  2712. /* Create NFA */
  2713. nfa = REGEX_INTERNAL_construct_nfa (regex, len);
  2714. if (NULL == nfa)
  2715. {
  2716. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2717. "Could not create DFA, because NFA creation failed\n");
  2718. return NULL;
  2719. }
  2720. dfa = GNUNET_new (struct REGEX_INTERNAL_Automaton);
  2721. dfa->type = DFA;
  2722. dfa->regex = GNUNET_strdup (regex);
  2723. /* Create DFA start state from epsilon closure */
  2724. memset (&singleton_set, 0, sizeof(struct REGEX_INTERNAL_StateSet));
  2725. state_set_append (&singleton_set, nfa->start);
  2726. nfa_closure_set_create (&nfa_start_eps_cls, nfa, &singleton_set, NULL);
  2727. state_set_clear (&singleton_set);
  2728. dfa->start = dfa_state_create (&ctx, &nfa_start_eps_cls);
  2729. automaton_add_state (dfa, dfa->start);
  2730. construct_dfa_states (&ctx, nfa, dfa, dfa->start);
  2731. REGEX_INTERNAL_automaton_destroy (nfa);
  2732. /* Minimize DFA */
  2733. if (GNUNET_OK != dfa_minimize (&ctx, dfa))
  2734. {
  2735. REGEX_INTERNAL_automaton_destroy (dfa);
  2736. return NULL;
  2737. }
  2738. /* Create proofs and hashes for all states */
  2739. if (GNUNET_OK != automaton_create_proofs (dfa))
  2740. {
  2741. REGEX_INTERNAL_automaton_destroy (dfa);
  2742. return NULL;
  2743. }
  2744. /* Compress linear DFA paths */
  2745. if (1 != max_path_len)
  2746. dfa_compress_paths (&ctx, dfa, max_path_len);
  2747. return dfa;
  2748. }
  2749. /**
  2750. * Free the memory allocated by constructing the REGEX_INTERNAL_Automaton data
  2751. * structure.
  2752. *
  2753. * @param a automaton to be destroyed
  2754. */
  2755. void
  2756. REGEX_INTERNAL_automaton_destroy (struct REGEX_INTERNAL_Automaton *a)
  2757. {
  2758. struct REGEX_INTERNAL_State *s;
  2759. struct REGEX_INTERNAL_State *next_state;
  2760. if (NULL == a)
  2761. return;
  2762. GNUNET_free (a->regex);
  2763. GNUNET_free (a->canonical_regex);
  2764. for (s = a->states_head; NULL != s; s = next_state)
  2765. {
  2766. next_state = s->next;
  2767. GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s);
  2768. automaton_destroy_state (s);
  2769. }
  2770. GNUNET_free (a);
  2771. }
  2772. /**
  2773. * Evaluates the given string using the given DFA automaton
  2774. *
  2775. * @param a automaton, type must be DFA
  2776. * @param string string that should be evaluated
  2777. *
  2778. * @return 0 if string matches, non-0 otherwise
  2779. */
  2780. static int
  2781. evaluate_dfa (struct REGEX_INTERNAL_Automaton *a, const char *string)
  2782. {
  2783. const char *strp;
  2784. struct REGEX_INTERNAL_State *s;
  2785. unsigned int step_len;
  2786. if (DFA != a->type)
  2787. {
  2788. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2789. "Tried to evaluate DFA, but NFA automaton given");
  2790. return -1;
  2791. }
  2792. s = a->start;
  2793. /* If the string is empty but the starting state is accepting, we accept. */
  2794. if (((NULL == string) || (0 == strlen (string))) && s->accepting)
  2795. return 0;
  2796. for (strp = string; NULL != strp && *strp; strp += step_len)
  2797. {
  2798. step_len = dfa_move (&s, strp);
  2799. if (NULL == s)
  2800. break;
  2801. }
  2802. if ((NULL != s) && s->accepting)
  2803. return 0;
  2804. return 1;
  2805. }
  2806. /**
  2807. * Evaluates the given string using the given NFA automaton
  2808. *
  2809. * @param a automaton, type must be NFA
  2810. * @param string string that should be evaluated
  2811. * @return 0 if string matches, non-0 otherwise
  2812. */
  2813. static int
  2814. evaluate_nfa (struct REGEX_INTERNAL_Automaton *a, const char *string)
  2815. {
  2816. const char *strp;
  2817. char str[2];
  2818. struct REGEX_INTERNAL_State *s;
  2819. struct REGEX_INTERNAL_StateSet sset;
  2820. struct REGEX_INTERNAL_StateSet new_sset;
  2821. struct REGEX_INTERNAL_StateSet singleton_set;
  2822. unsigned int i;
  2823. int result;
  2824. if (NFA != a->type)
  2825. {
  2826. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2827. "Tried to evaluate NFA, but DFA automaton given");
  2828. return -1;
  2829. }
  2830. /* If the string is empty but the starting state is accepting, we accept. */
  2831. if (((NULL == string) || (0 == strlen (string))) && a->start->accepting)
  2832. return 0;
  2833. result = 1;
  2834. memset (&singleton_set, 0, sizeof(struct REGEX_INTERNAL_StateSet));
  2835. state_set_append (&singleton_set, a->start);
  2836. nfa_closure_set_create (&sset, a, &singleton_set, NULL);
  2837. state_set_clear (&singleton_set);
  2838. str[1] = '\0';
  2839. for (strp = string; NULL != strp && *strp; strp++)
  2840. {
  2841. str[0] = *strp;
  2842. nfa_closure_set_create (&new_sset, a, &sset, str);
  2843. state_set_clear (&sset);
  2844. nfa_closure_set_create (&sset, a, &new_sset, 0);
  2845. state_set_clear (&new_sset);
  2846. }
  2847. for (i = 0; i < sset.off; i++)
  2848. {
  2849. s = sset.states[i];
  2850. if ((NULL != s) && (s->accepting))
  2851. {
  2852. result = 0;
  2853. break;
  2854. }
  2855. }
  2856. state_set_clear (&sset);
  2857. return result;
  2858. }
  2859. /**
  2860. * Evaluates the given @a string against the given compiled regex @a a
  2861. *
  2862. * @param a automaton
  2863. * @param string string to check
  2864. * @return 0 if string matches, non-0 otherwise
  2865. */
  2866. int
  2867. REGEX_INTERNAL_eval (struct REGEX_INTERNAL_Automaton *a, const char *string)
  2868. {
  2869. int result;
  2870. switch (a->type)
  2871. {
  2872. case DFA:
  2873. result = evaluate_dfa (a, string);
  2874. break;
  2875. case NFA:
  2876. result = evaluate_nfa (a, string);
  2877. break;
  2878. default:
  2879. GNUNET_log (GNUNET_ERROR_TYPE_ERROR,
  2880. "Evaluating regex failed, automaton has no type!\n");
  2881. result = GNUNET_SYSERR;
  2882. break;
  2883. }
  2884. return result;
  2885. }
  2886. /**
  2887. * Get the canonical regex of the given automaton.
  2888. * When constructing the automaton a proof is computed for each state,
  2889. * consisting of the regular expression leading to this state. A complete
  2890. * regex for the automaton can be computed by combining these proofs.
  2891. * As of now this function is only useful for testing.
  2892. *
  2893. * @param a automaton for which the canonical regex should be returned.
  2894. *
  2895. * @return
  2896. */
  2897. const char *
  2898. REGEX_INTERNAL_get_canonical_regex (struct REGEX_INTERNAL_Automaton *a)
  2899. {
  2900. if (NULL == a)
  2901. return NULL;
  2902. return a->canonical_regex;
  2903. }
  2904. /**
  2905. * Get the number of transitions that are contained in the given automaton.
  2906. *
  2907. * @param a automaton for which the number of transitions should be returned.
  2908. *
  2909. * @return number of transitions in the given automaton.
  2910. */
  2911. unsigned int
  2912. REGEX_INTERNAL_get_transition_count (struct REGEX_INTERNAL_Automaton *a)
  2913. {
  2914. unsigned int t_count;
  2915. struct REGEX_INTERNAL_State *s;
  2916. if (NULL == a)
  2917. return 0;
  2918. t_count = 0;
  2919. for (s = a->states_head; NULL != s; s = s->next)
  2920. t_count += s->transition_count;
  2921. return t_count;
  2922. }
  2923. /**
  2924. * Get the first key for the given @a input_string. This hashes the first x bits
  2925. * of the @a input_string.
  2926. *
  2927. * @param input_string string.
  2928. * @param string_len length of the @a input_string.
  2929. * @param key pointer to where to write the hash code.
  2930. * @return number of bits of @a input_string that have been consumed
  2931. * to construct the key
  2932. */
  2933. size_t
  2934. REGEX_INTERNAL_get_first_key (const char *input_string,
  2935. size_t string_len,
  2936. struct GNUNET_HashCode *key)
  2937. {
  2938. size_t size;
  2939. size = string_len < GNUNET_REGEX_INITIAL_BYTES ? string_len
  2940. : GNUNET_REGEX_INITIAL_BYTES;
  2941. if (NULL == input_string)
  2942. {
  2943. GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Given input string was NULL!\n");
  2944. return 0;
  2945. }
  2946. GNUNET_CRYPTO_hash (input_string, size, key);
  2947. return size;
  2948. }
  2949. /**
  2950. * Recursive function that calls the iterator for each synthetic start state.
  2951. *
  2952. * @param min_len minimum length of the path in the graph.
  2953. * @param max_len maximum length of the path in the graph.
  2954. * @param consumed_string string consumed by traversing the graph till this state.
  2955. * @param state current state of the automaton.
  2956. * @param iterator iterator function called for each edge.
  2957. * @param iterator_cls closure for the @a iterator function.
  2958. */
  2959. static void
  2960. iterate_initial_edge (unsigned int min_len,
  2961. unsigned int max_len,
  2962. char *consumed_string,
  2963. struct REGEX_INTERNAL_State *state,
  2964. REGEX_INTERNAL_KeyIterator iterator,
  2965. void *iterator_cls)
  2966. {
  2967. char *temp;
  2968. struct REGEX_INTERNAL_Transition *t;
  2969. unsigned int num_edges = state->transition_count;
  2970. struct REGEX_BLOCK_Edge edges[num_edges];
  2971. struct REGEX_BLOCK_Edge edge[1];
  2972. struct GNUNET_HashCode hash;
  2973. struct GNUNET_HashCode hash_new;
  2974. unsigned int cur_len;
  2975. if (NULL != consumed_string)
  2976. cur_len = strlen (consumed_string);
  2977. else
  2978. cur_len = 0;
  2979. if (((cur_len >= min_len) || (GNUNET_YES == state->accepting)) &&
  2980. (cur_len > 0) && (NULL != consumed_string))
  2981. {
  2982. if (cur_len <= max_len)
  2983. {
  2984. if ((NULL != state->proof) &&
  2985. (0 != strcmp (consumed_string, state->proof)))
  2986. {
  2987. (void) state_get_edges (state, edges);
  2988. GNUNET_CRYPTO_hash (consumed_string, strlen (consumed_string), &hash);
  2989. GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
  2990. "Start state for string `%s' is %s\n",
  2991. consumed_string,
  2992. GNUNET_h2s (&hash));
  2993. iterator (iterator_cls,
  2994. &hash,
  2995. consumed_string,
  2996. state->accepting,
  2997. num_edges,
  2998. edges);
  2999. }
  3000. if ((GNUNET_YES == state->accepting) && (cur_len > 1) &&
  3001. (state->transition_count < 1) && (cur_len < max_len))
  3002. {
  3003. /* Special case for regex consisting of just a string that is shorter than
  3004. * max_len */
  3005. edge[0].label = &consumed_string[cur_len - 1];
  3006. edge[0].destination = state->hash;
  3007. temp = GNUNET_strdup (consumed_string);
  3008. temp[cur_len - 1] = '\0';
  3009. GNUNET_CRYPTO_hash (temp, cur_len - 1, &hash_new);
  3010. GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
  3011. "Start state for short string `%s' is %s\n",
  3012. temp,
  3013. GNUNET_h2s (&hash_new));
  3014. iterator (iterator_cls, &hash_new, temp, GNUNET_NO, 1, edge);
  3015. GNUNET_free (temp);
  3016. }
  3017. }
  3018. else /* cur_len > max_len */
  3019. {
  3020. /* Case where the concatenated labels are longer than max_len, then split. */
  3021. edge[0].label = &consumed_string[max_len];
  3022. edge[0].destination = state->hash;
  3023. temp = GNUNET_strdup (consumed_string);
  3024. temp[max_len] = '\0';
  3025. GNUNET_CRYPTO_hash (temp, max_len, &hash);
  3026. GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
  3027. "Start state at split edge `%s'-`%s` is %s\n",
  3028. temp,
  3029. edge[0].label,
  3030. GNUNET_h2s (&hash_new));
  3031. iterator (iterator_cls, &hash, temp, GNUNET_NO, 1, edge);
  3032. GNUNET_free (temp);
  3033. }
  3034. }
  3035. if (cur_len < max_len)
  3036. {
  3037. for (t = state->transitions_head; NULL != t; t = t->next)
  3038. {
  3039. if (NULL != strchr (t->label, (int) '.'))
  3040. {
  3041. /* Wildcards not allowed during starting states */
  3042. GNUNET_break (0);
  3043. continue;
  3044. }
  3045. if (NULL != consumed_string)
  3046. GNUNET_asprintf (&temp, "%s%s", consumed_string, t->label);
  3047. else
  3048. GNUNET_asprintf (&temp, "%s", t->label);
  3049. iterate_initial_edge (min_len,
  3050. max_len,
  3051. temp,
  3052. t->to_state,
  3053. iterator,
  3054. iterator_cls);
  3055. GNUNET_free (temp);
  3056. }
  3057. }
  3058. }
  3059. /**
  3060. * Iterate over all edges starting from start state of automaton 'a'. Calling
  3061. * iterator for each edge.
  3062. *
  3063. * @param a automaton.
  3064. * @param iterator iterator called for each edge.
  3065. * @param iterator_cls closure.
  3066. */
  3067. void
  3068. REGEX_INTERNAL_iterate_all_edges (struct REGEX_INTERNAL_Automaton *a,
  3069. REGEX_INTERNAL_KeyIterator iterator,
  3070. void *iterator_cls)
  3071. {
  3072. struct REGEX_INTERNAL_State *s;
  3073. GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Iterating over starting edges\n");
  3074. iterate_initial_edge (GNUNET_REGEX_INITIAL_BYTES,
  3075. GNUNET_REGEX_INITIAL_BYTES,
  3076. NULL,
  3077. a->start,
  3078. iterator,
  3079. iterator_cls);
  3080. GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Iterating over DFA edges\n");
  3081. for (s = a->states_head; NULL != s; s = s->next)
  3082. {
  3083. struct REGEX_BLOCK_Edge edges[s->transition_count];
  3084. unsigned int num_edges;
  3085. num_edges = state_get_edges (s, edges);
  3086. if (((NULL != s->proof) && (0 < strlen (s->proof))) || s->accepting)
  3087. {
  3088. GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
  3089. "Creating DFA edges at `%s' under key %s\n",
  3090. s->proof,
  3091. GNUNET_h2s (&s->hash));
  3092. iterator (iterator_cls,
  3093. &s->hash,
  3094. s->proof,
  3095. s->accepting,
  3096. num_edges,
  3097. edges);
  3098. }
  3099. s->marked = GNUNET_NO;
  3100. }
  3101. }
  3102. /**
  3103. * Struct to hold all the relevant state information in the HashMap.
  3104. *
  3105. * Contains the same info as the Regex Iterator parameters except the key,
  3106. * which comes directly from the HashMap iterator.
  3107. */
  3108. struct temporal_state_store
  3109. {
  3110. int reachable;
  3111. char *proof;
  3112. int accepting;
  3113. int num_edges;
  3114. struct REGEX_BLOCK_Edge *edges;
  3115. };
  3116. /**
  3117. * Store regex iterator and cls in one place to pass to the hashmap iterator.
  3118. */
  3119. struct client_iterator
  3120. {
  3121. REGEX_INTERNAL_KeyIterator iterator;
  3122. void *iterator_cls;
  3123. };
  3124. /**
  3125. * Iterator over all edges of a dfa. Stores all of them in a HashMap
  3126. * for later reachability marking.
  3127. *
  3128. * @param cls Closure (HashMap)
  3129. * @param key hash for current state.
  3130. * @param proof proof for current state
  3131. * @param accepting GNUNET_YES if this is an accepting state, GNUNET_NO if not.
  3132. * @param num_edges number of edges leaving current state.
  3133. * @param edges edges leaving current state.
  3134. */
  3135. static void
  3136. store_all_states (void *cls,
  3137. const struct GNUNET_HashCode *key,
  3138. const char *proof,
  3139. int accepting,
  3140. unsigned int num_edges,
  3141. const struct REGEX_BLOCK_Edge *edges)
  3142. {
  3143. struct GNUNET_CONTAINER_MultiHashMap *hm = cls;
  3144. struct temporal_state_store *tmp;
  3145. size_t edges_size;
  3146. tmp = GNUNET_new (struct temporal_state_store);
  3147. tmp->reachable = GNUNET_NO;
  3148. tmp->proof = GNUNET_strdup (proof);
  3149. tmp->accepting = accepting;
  3150. tmp->num_edges = num_edges;
  3151. edges_size = sizeof(struct REGEX_BLOCK_Edge) * num_edges;
  3152. tmp->edges = GNUNET_malloc (edges_size);
  3153. GNUNET_memcpy (tmp->edges, edges, edges_size);
  3154. GNUNET_assert (GNUNET_YES ==
  3155. GNUNET_CONTAINER_multihashmap_put (
  3156. hm,
  3157. key,
  3158. tmp,
  3159. GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_FAST));
  3160. }
  3161. /**
  3162. * Mark state as reachable and call recursively on all its edges.
  3163. *
  3164. * If already marked as reachable, do nothing.
  3165. *
  3166. * @param state State to mark as reachable.
  3167. * @param hm HashMap which stores all the states indexed by key.
  3168. */
  3169. static void
  3170. mark_as_reachable (struct temporal_state_store *state,
  3171. struct GNUNET_CONTAINER_MultiHashMap *hm)
  3172. {
  3173. struct temporal_state_store *child;
  3174. unsigned int i;
  3175. if (GNUNET_YES == state->reachable)
  3176. /* visited */
  3177. return;
  3178. state->reachable = GNUNET_YES;
  3179. for (i = 0; i < state->num_edges; i++)
  3180. {
  3181. child =
  3182. GNUNET_CONTAINER_multihashmap_get (hm, &state->edges[i].destination);
  3183. if (NULL == child)
  3184. {
  3185. GNUNET_break (0);
  3186. continue;
  3187. }
  3188. mark_as_reachable (child, hm);
  3189. }
  3190. }
  3191. /**
  3192. * Iterator over hash map entries to mark the ones that are reachable.
  3193. *
  3194. * @param cls closure
  3195. * @param key current key code
  3196. * @param value value in the hash map
  3197. * @return #GNUNET_YES if we should continue to iterate,
  3198. * #GNUNET_NO if not.
  3199. */
  3200. static int
  3201. reachability_iterator (void *cls,
  3202. const struct GNUNET_HashCode *key,
  3203. void *value)
  3204. {
  3205. struct GNUNET_CONTAINER_MultiHashMap *hm = cls;
  3206. struct temporal_state_store *state = value;
  3207. if (GNUNET_YES == state->reachable)
  3208. /* already visited and marked */
  3209. return GNUNET_YES;
  3210. if ((GNUNET_REGEX_INITIAL_BYTES > strlen (state->proof)) &&
  3211. (GNUNET_NO == state->accepting) )
  3212. /* not directly reachable */
  3213. return GNUNET_YES;
  3214. mark_as_reachable (state, hm);
  3215. return GNUNET_YES;
  3216. }
  3217. /**
  3218. * Iterator over hash map entries.
  3219. * Calling the callback on the ones marked as reachables.
  3220. *
  3221. * @param cls closure
  3222. * @param key current key code
  3223. * @param value value in the hash map
  3224. * @return #GNUNET_YES if we should continue to iterate,
  3225. * #GNUNET_NO if not.
  3226. */
  3227. static int
  3228. iterate_reachables (void *cls, const struct GNUNET_HashCode *key, void *value)
  3229. {
  3230. struct client_iterator *ci = cls;
  3231. struct temporal_state_store *state = value;
  3232. if (GNUNET_YES == state->reachable)
  3233. {
  3234. ci->iterator (ci->iterator_cls,
  3235. key,
  3236. state->proof,
  3237. state->accepting,
  3238. state->num_edges,
  3239. state->edges);
  3240. }
  3241. GNUNET_free (state->edges);
  3242. GNUNET_free (state->proof);
  3243. GNUNET_free (state);
  3244. return GNUNET_YES;
  3245. }
  3246. /**
  3247. * Iterate over all edges of automaton 'a' that are reachable from a state with
  3248. * a proof of at least GNUNET_REGEX_INITIAL_BYTES characters.
  3249. *
  3250. * Call the iterator for each such edge.
  3251. *
  3252. * @param a automaton.
  3253. * @param iterator iterator called for each reachable edge.
  3254. * @param iterator_cls closure.
  3255. */
  3256. void
  3257. REGEX_INTERNAL_iterate_reachable_edges (struct REGEX_INTERNAL_Automaton *a,
  3258. REGEX_INTERNAL_KeyIterator iterator,
  3259. void *iterator_cls)
  3260. {
  3261. struct GNUNET_CONTAINER_MultiHashMap *hm;
  3262. struct client_iterator ci;
  3263. hm = GNUNET_CONTAINER_multihashmap_create (a->state_count * 2, GNUNET_NO);
  3264. ci.iterator = iterator;
  3265. ci.iterator_cls = iterator_cls;
  3266. REGEX_INTERNAL_iterate_all_edges (a, &store_all_states, hm);
  3267. GNUNET_CONTAINER_multihashmap_iterate (hm, &reachability_iterator, hm);
  3268. GNUNET_CONTAINER_multihashmap_iterate (hm, &iterate_reachables, &ci);
  3269. GNUNET_CONTAINER_multihashmap_destroy (hm);
  3270. }
  3271. /* end of regex_internal.c */