irrUString.h 95 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
7377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891
  1. /*
  2. Basic Unicode string class for Irrlicht.
  3. Copyright (c) 2009-2011 John Norman
  4. This software is provided 'as-is', without any express or implied
  5. warranty. In no event will the authors be held liable for any
  6. damages arising from the use of this software.
  7. Permission is granted to anyone to use this software for any
  8. purpose, including commercial applications, and to alter it and
  9. redistribute it freely, subject to the following restrictions:
  10. 1. The origin of this software must not be misrepresented; you
  11. must not claim that you wrote the original software. If you use
  12. this software in a product, an acknowledgment in the product
  13. documentation would be appreciated but is not required.
  14. 2. Altered source versions must be plainly marked as such, and
  15. must not be misrepresented as being the original software.
  16. 3. This notice may not be removed or altered from any source
  17. distribution.
  18. The original version of this class can be located at:
  19. http://irrlicht.suckerfreegames.com/
  20. John Norman
  21. john@suckerfreegames.com
  22. */
  23. #pragma once
  24. #if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__)
  25. # define USTRING_CPP0X
  26. # if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
  27. # define USTRING_CPP0X_NEWLITERALS
  28. # endif
  29. #endif
  30. #include <stdio.h>
  31. #include <string.h>
  32. #include <stdlib.h>
  33. #include <cstddef>
  34. #ifdef _WIN32
  35. #define __BYTE_ORDER 0
  36. #define __LITTLE_ENDIAN 0
  37. #define __BIG_ENDIAN 1
  38. #elif defined(__MACH__) && defined(__APPLE__)
  39. #include <machine/endian.h>
  40. #elif defined(__FreeBSD__) || defined(__DragonFly__)
  41. #include <sys/endian.h>
  42. #else
  43. #include <endian.h>
  44. #endif
  45. #ifdef USTRING_CPP0X
  46. # include <utility>
  47. #endif
  48. #ifndef USTRING_NO_STL
  49. # include <string>
  50. # include <iterator>
  51. # include <ostream>
  52. #endif
  53. #include "irrTypes.h"
  54. #include "irrAllocator.h"
  55. #include "irrArray.h"
  56. #include "irrMath.h"
  57. #include "irrString.h"
  58. #include "path.h"
  59. //! First code unit of the UTF-16 high (0xD800) and low (0xDC00) surrogate ranges.
  60. static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
  61. static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
  62. //! Test whether a UTF-16 code unit is any surrogate, a high surrogate, or a low surrogate, by masking its top bits.
  63. #define UTF16_IS_SURROGATE(c) (((c) & 0xF800) == 0xD800)
  64. #define UTF16_IS_SURROGATE_HI(c) (((c) & 0xFC00) == 0xD800)
  65. #define UTF16_IS_SURROGATE_LO(c) (((c) & 0xFC00) == 0xDC00)
  66. namespace irr
  67. {
  68. // Character types for each UTF width: the built-in char32_t/char16_t/char when USTRING_CPP0X_NEWLITERALS is defined, Irrlicht's u32/u16/u8 otherwise.
  69. #ifdef USTRING_CPP0X_NEWLITERALS // C++0x
  70. typedef char32_t uchar32_t;
  71. typedef char16_t uchar16_t;
  72. typedef char uchar8_t;
  73. #else
  74. typedef u32 uchar32_t;
  75. typedef u16 uchar16_t;
  76. typedef u8 uchar8_t;
  77. #endif
  78. namespace core
  79. {
  80. namespace unicode
  81. {
  82. //! The Unicode replacement character (U+FFFD). Used to replace invalid characters.
  83. const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
  84. //! Convert a UTF-16 surrogate pair into a UTF-32 character.
  85. //! \param high The high value of the pair.
  86. //! \param low The low value of the pair.
  87. //! \return The UTF-32 character expressed by the surrogate pair.
  88. inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
  89. {
  90. // Convert the surrogate pair into a single UTF-32 character.
  91. uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
  92. uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
  93. return (wu << 16) | x;
  94. }
  95. //! Swaps the endianness of a 16-bit value.
  96. //! \return The new value.
  97. inline uchar16_t swapEndian16(const uchar16_t& c)
  98. {
  99. return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
  100. }
  101. //! Swaps the endianness of a 32-bit value.
  102. //! \return The new value.
  103. inline uchar32_t swapEndian32(const uchar32_t& c)
  104. {
  105. return ((c >> 24) & 0x000000FF) |
  106. ((c >> 8) & 0x0000FF00) |
  107. ((c << 8) & 0x00FF0000) |
  108. ((c << 24) & 0xFF000000);
  109. }
  110. //! The Unicode byte order mark.
  111. const u16 BOM = 0xFEFF;
  112. //! The size of the Unicode byte order mark in terms of the Unicode character size.
  113. const u8 BOM_UTF8_LEN = 3;
  114. const u8 BOM_UTF16_LEN = 1;
  115. const u8 BOM_UTF32_LEN = 1;
  116. //! Unicode byte order marks for file operations.
  117. const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
  118. const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
  119. const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
  120. const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
  121. const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
  122. //! The size in bytes of the Unicode byte marks for file operations.
  123. const u8 BOM_ENCODE_UTF8_LEN = 3;
  124. const u8 BOM_ENCODE_UTF16_LEN = 2;
  125. const u8 BOM_ENCODE_UTF32_LEN = 4;
  //! Unicode encoding type.
  enum EUTF_ENCODE
  {
      EUTFE_NONE = 0,     //!< No encoding.
      EUTFE_UTF8 = 1,     //!< UTF-8.
      EUTFE_UTF16 = 2,    //!< UTF-16 without an explicit byte order.
      EUTFE_UTF16_LE = 3, //!< UTF-16, little endian.
      EUTFE_UTF16_BE = 4, //!< UTF-16, big endian.
      EUTFE_UTF32 = 5,    //!< UTF-32 without an explicit byte order.
      EUTFE_UTF32_LE = 6, //!< UTF-32, little endian.
      EUTFE_UTF32_BE = 7  //!< UTF-32, big endian.
  };
  //! Unicode endianness.
  enum EUTF_ENDIAN
  {
      EUTFEE_NATIVE = 0, //!< Native byte order.
      EUTFEE_LITTLE = 1, //!< Little endian.
      EUTFEE_BIG = 2     //!< Big endian.
  };
  145. //! Returns the specified unicode byte order mark in a byte array.
  146. //! The byte order mark is the first few bytes in a text file that signifies its encoding.
  147. /** \param mode The Unicode encoding method that we want to get the byte order mark for.
  148. If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
  149. //! \return An array that contains a byte order mark.
  150. inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
  151. {
  152. #define COPY_ARRAY(source, size) \
  153. memcpy(ret.pointer(), source, size); \
  154. ret.set_used(size)
  155. core::array<u8> ret(4);
  156. switch (mode)
  157. {
  158. case EUTFE_UTF8:
  159. COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
  160. break;
  161. case EUTFE_UTF16:
  162. #ifdef __BIG_ENDIAN__
  163. COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
  164. #else
  165. COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
  166. #endif
  167. break;
  168. case EUTFE_UTF16_BE:
  169. COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
  170. break;
  171. case EUTFE_UTF16_LE:
  172. COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
  173. break;
  174. case EUTFE_UTF32:
  175. #ifdef __BIG_ENDIAN__
  176. COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
  177. #else
  178. COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
  179. #endif
  180. break;
  181. case EUTFE_UTF32_BE:
  182. COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
  183. break;
  184. case EUTFE_UTF32_LE:
  185. COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
  186. break;
  187. case EUTFE_NONE:
  188. // TODO sapier: fixed warning only,
  189. // don't know if something needs to be done here
  190. break;
  191. }
  192. return ret;
  193. #undef COPY_ARRAY
  194. }
  195. //! Detects if the given data stream starts with a unicode BOM.
  196. //! \param data The data stream to check.
  197. //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
  198. inline EUTF_ENCODE determineUnicodeBOM(const char* data)
  199. {
  200. if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
  201. if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
  202. if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
  203. if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
  204. if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
  205. return EUTFE_NONE;
  206. }
  207. } // end namespace unicode
  208. //! UTF-16 string class.
  209. template <typename TAlloc = irrAllocator<uchar16_t> >
  210. class ustring16
  211. {
  212. public:
  213. ///------------------///
  214. /// iterator classes ///
  215. ///------------------///
  216. //! Access an element in a unicode string, allowing one to change it.
  217. class _ustring16_iterator_access
  218. {
  219. public:
  220. _ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
  221. //! Allow the class to be interpreted as a single UTF-32 character.
  222. operator uchar32_t() const
  223. {
  224. return _get();
  225. }
  226. //! Allow one to change the character in the unicode string.
  227. //! \param c The new character to use.
  228. //! \return Myself.
  229. _ustring16_iterator_access& operator=(const uchar32_t c)
  230. {
  231. _set(c);
  232. return *this;
  233. }
  234. //! Increments the value by 1.
  235. //! \return Myself.
  236. _ustring16_iterator_access& operator++()
  237. {
  238. _set(_get() + 1);
  239. return *this;
  240. }
  241. //! Increments the value by 1, returning the old value.
  242. //! \return A unicode character.
  243. uchar32_t operator++(int)
  244. {
  245. uchar32_t old = _get();
  246. _set(old + 1);
  247. return old;
  248. }
  249. //! Decrements the value by 1.
  250. //! \return Myself.
  251. _ustring16_iterator_access& operator--()
  252. {
  253. _set(_get() - 1);
  254. return *this;
  255. }
  256. //! Decrements the value by 1, returning the old value.
  257. //! \return A unicode character.
  258. uchar32_t operator--(int)
  259. {
  260. uchar32_t old = _get();
  261. _set(old - 1);
  262. return old;
  263. }
  264. //! Adds to the value by a specified amount.
  265. //! \param val The amount to add to this character.
  266. //! \return Myself.
  267. _ustring16_iterator_access& operator+=(int val)
  268. {
  269. _set(_get() + val);
  270. return *this;
  271. }
  272. //! Subtracts from the value by a specified amount.
  273. //! \param val The amount to subtract from this character.
  274. //! \return Myself.
  275. _ustring16_iterator_access& operator-=(int val)
  276. {
  277. _set(_get() - val);
  278. return *this;
  279. }
  280. //! Multiples the value by a specified amount.
  281. //! \param val The amount to multiply this character by.
  282. //! \return Myself.
  283. _ustring16_iterator_access& operator*=(int val)
  284. {
  285. _set(_get() * val);
  286. return *this;
  287. }
  288. //! Divides the value by a specified amount.
  289. //! \param val The amount to divide this character by.
  290. //! \return Myself.
  291. _ustring16_iterator_access& operator/=(int val)
  292. {
  293. _set(_get() / val);
  294. return *this;
  295. }
  296. //! Modulos the value by a specified amount.
  297. //! \param val The amount to modulo this character by.
  298. //! \return Myself.
  299. _ustring16_iterator_access& operator%=(int val)
  300. {
  301. _set(_get() % val);
  302. return *this;
  303. }
  304. //! Adds to the value by a specified amount.
  305. //! \param val The amount to add to this character.
  306. //! \return A unicode character.
  307. uchar32_t operator+(int val) const
  308. {
  309. return _get() + val;
  310. }
  311. //! Subtracts from the value by a specified amount.
  312. //! \param val The amount to subtract from this character.
  313. //! \return A unicode character.
  314. uchar32_t operator-(int val) const
  315. {
  316. return _get() - val;
  317. }
  318. //! Multiplies the value by a specified amount.
  319. //! \param val The amount to multiply this character by.
  320. //! \return A unicode character.
  321. uchar32_t operator*(int val) const
  322. {
  323. return _get() * val;
  324. }
  325. //! Divides the value by a specified amount.
  326. //! \param val The amount to divide this character by.
  327. //! \return A unicode character.
  328. uchar32_t operator/(int val) const
  329. {
  330. return _get() / val;
  331. }
  332. //! Modulos the value by a specified amount.
  333. //! \param val The amount to modulo this character by.
  334. //! \return A unicode character.
  335. uchar32_t operator%(int val) const
  336. {
  337. return _get() % val;
  338. }
  339. private:
  340. //! Gets a uchar32_t from our current position.
  341. uchar32_t _get() const
  342. {
  343. const uchar16_t* a = ref->c_str();
  344. if (!UTF16_IS_SURROGATE(a[pos]))
  345. return static_cast<uchar32_t>(a[pos]);
  346. else
  347. {
  348. if (pos + 1 >= ref->size_raw())
  349. return 0;
  350. return unicode::toUTF32(a[pos], a[pos + 1]);
  351. }
  352. }
  353. //! Sets a uchar32_t at our current position.
  354. void _set(uchar32_t c)
  355. {
  356. ustring16<TAlloc>* ref2 = const_cast<ustring16<TAlloc>*>(ref);
  357. const uchar16_t* a = ref2->c_str();
  358. if (c > 0xFFFF)
  359. {
  360. // c will be multibyte, so split it up into the high and low surrogate pairs.
  361. uchar16_t x = static_cast<uchar16_t>(c);
  362. uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
  363. uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
  364. // If the previous position was a surrogate pair, just replace them. Else, insert the low pair.
  365. if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
  366. ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
  367. else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
  368. ref2->replace_raw(vh, static_cast<u32>(pos));
  369. }
  370. else
  371. {
  372. // c will be a single byte.
  373. uchar16_t vh = static_cast<uchar16_t>(c);
  374. // If the previous position was a surrogate pair, remove the extra byte.
  375. if (UTF16_IS_SURROGATE_HI(a[pos]))
  376. ref2->erase_raw(static_cast<u32>(pos) + 1);
  377. ref2->replace_raw(vh, static_cast<u32>(pos));
  378. }
  379. }
  380. const ustring16<TAlloc>* ref;
  381. u32 pos;
  382. };
  383. typedef typename ustring16<TAlloc>::_ustring16_iterator_access access;
  384. //! Iterator to iterate through a UTF-16 string.
  385. #ifndef USTRING_NO_STL
  386. class _ustring16_const_iterator : public std::iterator<
  387. std::bidirectional_iterator_tag, // iterator_category
  388. access, // value_type
  389. ptrdiff_t, // difference_type
  390. const access, // pointer
  391. const access // reference
  392. >
  393. #else
  394. class _ustring16_const_iterator
  395. #endif
  396. {
  397. public:
  398. typedef _ustring16_const_iterator _Iter;
  399. typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
  400. typedef const access const_pointer;
  401. typedef const access const_reference;
  402. #ifndef USTRING_NO_STL
  403. typedef typename _Base::value_type value_type;
  404. typedef typename _Base::difference_type difference_type;
  405. typedef typename _Base::difference_type distance_type;
  406. typedef typename _Base::pointer pointer;
  407. typedef const_reference reference;
  408. #else
  409. typedef access value_type;
  410. typedef u32 difference_type;
  411. typedef u32 distance_type;
  412. typedef const_pointer pointer;
  413. typedef const_reference reference;
  414. #endif
  415. //! Constructors.
  416. _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
  417. _ustring16_const_iterator(const ustring16<TAlloc>& s) : ref(&s), pos(0) {}
  418. _ustring16_const_iterator(const ustring16<TAlloc>& s, const u32 p) : ref(&s), pos(0)
  419. {
  420. if (ref->size_raw() == 0 || p == 0)
  421. return;
  422. // Go to the appropriate position.
  423. u32 i = p;
  424. u32 sr = ref->size_raw();
  425. const uchar16_t* a = ref->c_str();
  426. while (i != 0 && pos < sr)
  427. {
  428. if (UTF16_IS_SURROGATE_HI(a[pos]))
  429. pos += 2;
  430. else ++pos;
  431. --i;
  432. }
  433. }
  434. //! Test for equalness.
  435. bool operator==(const _Iter& iter) const
  436. {
  437. if (ref == iter.ref && pos == iter.pos)
  438. return true;
  439. return false;
  440. }
  441. //! Test for unequalness.
  442. bool operator!=(const _Iter& iter) const
  443. {
  444. if (ref != iter.ref || pos != iter.pos)
  445. return true;
  446. return false;
  447. }
  448. //! Switch to the next full character in the string.
  449. _Iter& operator++()
  450. { // ++iterator
  451. if (pos == ref->size_raw()) return *this;
  452. const uchar16_t* a = ref->c_str();
  453. if (UTF16_IS_SURROGATE_HI(a[pos]))
  454. pos += 2; // TODO: check for valid low surrogate?
  455. else ++pos;
  456. if (pos > ref->size_raw()) pos = ref->size_raw();
  457. return *this;
  458. }
  459. //! Switch to the next full character in the string, returning the previous position.
  460. _Iter operator++(int)
  461. { // iterator++
  462. _Iter _tmp(*this);
  463. ++*this;
  464. return _tmp;
  465. }
  466. //! Switch to the previous full character in the string.
  467. _Iter& operator--()
  468. { // --iterator
  469. if (pos == 0) return *this;
  470. const uchar16_t* a = ref->c_str();
  471. --pos;
  472. if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0) // low surrogate, go back one more.
  473. --pos;
  474. return *this;
  475. }
  476. //! Switch to the previous full character in the string, returning the previous position.
  477. _Iter operator--(int)
  478. { // iterator--
  479. _Iter _tmp(*this);
  480. --*this;
  481. return _tmp;
  482. }
  483. //! Advance a specified number of full characters in the string.
  484. //! \return Myself.
  485. _Iter& operator+=(const difference_type v)
  486. {
  487. if (v == 0) return *this;
  488. if (v < 0) return operator-=(v * -1);
  489. if (pos >= ref->size_raw())
  490. return *this;
  491. // Go to the appropriate position.
  492. // TODO: Don't force u32 on an x64 OS. Make it agnostic.
  493. u32 i = (u32)v;
  494. u32 sr = ref->size_raw();
  495. const uchar16_t* a = ref->c_str();
  496. while (i != 0 && pos < sr)
  497. {
  498. if (UTF16_IS_SURROGATE_HI(a[pos]))
  499. pos += 2;
  500. else ++pos;
  501. --i;
  502. }
  503. if (pos > sr)
  504. pos = sr;
  505. return *this;
  506. }
  507. //! Go back a specified number of full characters in the string.
  508. //! \return Myself.
  509. _Iter& operator-=(const difference_type v)
  510. {
  511. if (v == 0) return *this;
  512. if (v > 0) return operator+=(v * -1);
  513. if (pos == 0)
  514. return *this;
  515. // Go to the appropriate position.
  516. // TODO: Don't force u32 on an x64 OS. Make it agnostic.
  517. u32 i = (u32)v;
  518. const uchar16_t* a = ref->c_str();
  519. while (i != 0 && pos != 0)
  520. {
  521. --pos;
  522. if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
  523. --pos;
  524. --i;
  525. }
  526. return *this;
  527. }
  528. //! Return a new iterator that is a variable number of full characters forward from the current position.
  529. _Iter operator+(const difference_type v) const
  530. {
  531. _Iter ret(*this);
  532. ret += v;
  533. return ret;
  534. }
  535. //! Return a new iterator that is a variable number of full characters backward from the current position.
  536. _Iter operator-(const difference_type v) const
  537. {
  538. _Iter ret(*this);
  539. ret -= v;
  540. return ret;
  541. }
  542. //! Returns the distance between two iterators.
  543. difference_type operator-(const _Iter& iter) const
  544. {
  545. // Make sure we reference the same object!
  546. if (ref != iter.ref)
  547. return difference_type();
  548. _Iter i = iter;
  549. difference_type ret;
  550. // Walk up.
  551. if (pos > i.pos)
  552. {
  553. while (pos > i.pos)
  554. {
  555. ++i;
  556. ++ret;
  557. }
  558. return ret;
  559. }
  560. // Walk down.
  561. while (pos < i.pos)
  562. {
  563. --i;
  564. --ret;
  565. }
  566. return ret;
  567. }
  568. //! Accesses the full character at the iterator's position.
  569. const_reference operator*() const
  570. {
  571. if (pos >= ref->size_raw())
  572. {
  573. const uchar16_t* a = ref->c_str();
  574. u32 p = ref->size_raw();
  575. if (UTF16_IS_SURROGATE_LO(a[p]))
  576. --p;
  577. reference ret(ref, p);
  578. return ret;
  579. }
  580. const_reference ret(ref, pos);
  581. return ret;
  582. }
  583. //! Accesses the full character at the iterator's position.
  584. reference operator*()
  585. {
  586. if (pos >= ref->size_raw())
  587. {
  588. const uchar16_t* a = ref->c_str();
  589. u32 p = ref->size_raw();
  590. if (UTF16_IS_SURROGATE_LO(a[p]))
  591. --p;
  592. reference ret(ref, p);
  593. return ret;
  594. }
  595. reference ret(ref, pos);
  596. return ret;
  597. }
  598. //! Accesses the full character at the iterator's position.
  599. const_pointer operator->() const
  600. {
  601. return operator*();
  602. }
  603. //! Accesses the full character at the iterator's position.
  604. pointer operator->()
  605. {
  606. return operator*();
  607. }
  608. //! Is the iterator at the start of the string?
  609. bool atStart() const
  610. {
  611. return pos == 0;
  612. }
  613. //! Is the iterator at the end of the string?
  614. bool atEnd() const
  615. {
  616. const uchar16_t* a = ref->c_str();
  617. if (UTF16_IS_SURROGATE(a[pos]))
  618. return (pos + 1) >= ref->size_raw();
  619. else return pos >= ref->size_raw();
  620. }
  621. //! Moves the iterator to the start of the string.
  622. void toStart()
  623. {
  624. pos = 0;
  625. }
  626. //! Moves the iterator to the end of the string.
  627. void toEnd()
  628. {
  629. pos = ref->size_raw();
  630. }
  631. //! Returns the iterator's position.
  632. //! \return The iterator's position.
  633. u32 getPos() const
  634. {
  635. return pos;
  636. }
  637. protected:
  638. const ustring16<TAlloc>* ref;
  639. u32 pos;
  640. };
  641. //! Iterator to iterate through a UTF-16 string.
  642. class _ustring16_iterator : public _ustring16_const_iterator
  643. {
  644. public:
  645. typedef _ustring16_iterator _Iter;
  646. typedef _ustring16_const_iterator _Base;
  647. typedef typename _Base::const_pointer const_pointer;
  648. typedef typename _Base::const_reference const_reference;
  649. typedef typename _Base::value_type value_type;
  650. typedef typename _Base::difference_type difference_type;
  651. typedef typename _Base::distance_type distance_type;
  652. typedef access pointer;
  653. typedef access reference;
  654. using _Base::pos;
  655. using _Base::ref;
  656. //! Constructors.
  657. _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
  658. _ustring16_iterator(const ustring16<TAlloc>& s) : _ustring16_const_iterator(s) {}
  659. _ustring16_iterator(const ustring16<TAlloc>& s, const u32 p) : _ustring16_const_iterator(s, p) {}
  660. //! Accesses the full character at the iterator's position.
  661. reference operator*() const
  662. {
  663. if (pos >= ref->size_raw())
  664. {
  665. const uchar16_t* a = ref->c_str();
  666. u32 p = ref->size_raw();
  667. if (UTF16_IS_SURROGATE_LO(a[p]))
  668. --p;
  669. reference ret(ref, p);
  670. return ret;
  671. }
  672. reference ret(ref, pos);
  673. return ret;
  674. }
  675. //! Accesses the full character at the iterator's position.
  676. reference operator*()
  677. {
  678. if (pos >= ref->size_raw())
  679. {
  680. const uchar16_t* a = ref->c_str();
  681. u32 p = ref->size_raw();
  682. if (UTF16_IS_SURROGATE_LO(a[p]))
  683. --p;
  684. reference ret(ref, p);
  685. return ret;
  686. }
  687. reference ret(ref, pos);
  688. return ret;
  689. }
  690. //! Accesses the full character at the iterator's position.
  691. pointer operator->() const
  692. {
  693. return operator*();
  694. }
  695. //! Accesses the full character at the iterator's position.
  696. pointer operator->()
  697. {
  698. return operator*();
  699. }
  700. };
  701. typedef typename ustring16<TAlloc>::_ustring16_iterator iterator;
  702. typedef typename ustring16<TAlloc>::_ustring16_const_iterator const_iterator;
  703. ///----------------------///
  704. /// end iterator classes ///
  705. ///----------------------///
  706. //! Default constructor
  707. ustring16()
  708. : array(0), allocated(1), used(0)
  709. {
  710. #if __BYTE_ORDER == __BIG_ENDIAN
  711. encoding = unicode::EUTFE_UTF16_BE;
  712. #else
  713. encoding = unicode::EUTFE_UTF16_LE;
  714. #endif
  715. array = allocator.allocate(1); // new u16[1];
  716. array[0] = 0x0;
  717. }
  718. //! Constructor
  719. ustring16(const ustring16<TAlloc>& other)
  720. : array(0), allocated(0), used(0)
  721. {
  722. #if __BYTE_ORDER == __BIG_ENDIAN
  723. encoding = unicode::EUTFE_UTF16_BE;
  724. #else
  725. encoding = unicode::EUTFE_UTF16_LE;
  726. #endif
  727. *this = other;
  728. }
  729. //! Constructor from other string types
  730. template <class B, class A>
  731. ustring16(const string<B, A>& other)
  732. : array(0), allocated(0), used(0)
  733. {
  734. #if __BYTE_ORDER == __BIG_ENDIAN
  735. encoding = unicode::EUTFE_UTF16_BE;
  736. #else
  737. encoding = unicode::EUTFE_UTF16_LE;
  738. #endif
  739. *this = other;
  740. }
  741. #ifndef USTRING_NO_STL
  742. //! Constructor from std::string
  743. template <class B, class A, typename Alloc>
  744. ustring16(const std::basic_string<B, A, Alloc>& other)
  745. : array(0), allocated(0), used(0)
  746. {
  747. #if __BYTE_ORDER == __BIG_ENDIAN
  748. encoding = unicode::EUTFE_UTF16_BE;
  749. #else
  750. encoding = unicode::EUTFE_UTF16_LE;
  751. #endif
  752. *this = other.c_str();
  753. }
  754. //! Constructor from iterator.
  755. template <typename Itr>
  756. ustring16(Itr first, Itr last)
  757. : array(0), allocated(0), used(0)
  758. {
  759. #if __BYTE_ORDER == __BIG_ENDIAN
  760. encoding = unicode::EUTFE_UTF16_BE;
  761. #else
  762. encoding = unicode::EUTFE_UTF16_LE;
  763. #endif
  764. reserve(std::distance(first, last));
  765. array[used] = 0;
  766. for (; first != last; ++first)
  767. append((uchar32_t)*first);
  768. }
  769. #endif
  770. #ifndef USTRING_CPP0X_NEWLITERALS
  771. //! Constructor for copying a character string from a pointer.
  772. ustring16(const char* const c)
  773. : array(0), allocated(0), used(0)
  774. {
  775. #if __BYTE_ORDER == __BIG_ENDIAN
  776. encoding = unicode::EUTFE_UTF16_BE;
  777. #else
  778. encoding = unicode::EUTFE_UTF16_LE;
  779. #endif
  780. loadDataStream(c, strlen(c));
  781. //append((uchar8_t*)c);
  782. }
  783. //! Constructor for copying a character string from a pointer with a given length.
  784. ustring16(const char* const c, u32 length)
  785. : array(0), allocated(0), used(0)
  786. {
  787. #if __BYTE_ORDER == __BIG_ENDIAN
  788. encoding = unicode::EUTFE_UTF16_BE;
  789. #else
  790. encoding = unicode::EUTFE_UTF16_LE;
  791. #endif
  792. loadDataStream(c, length);
  793. }
  794. #endif
  795. //! Constructor for copying a UTF-8 string from a pointer.
  796. ustring16(const uchar8_t* const c)
  797. : array(0), allocated(0), used(0)
  798. {
  799. #if __BYTE_ORDER == __BIG_ENDIAN
  800. encoding = unicode::EUTFE_UTF16_BE;
  801. #else
  802. encoding = unicode::EUTFE_UTF16_LE;
  803. #endif
  804. append(c);
  805. }
  806. //! Constructor for copying a UTF-8 string from a single char.
  807. ustring16(const char c)
  808. : array(0), allocated(0), used(0)
  809. {
  810. #if __BYTE_ORDER == __BIG_ENDIAN
  811. encoding = unicode::EUTFE_UTF16_BE;
  812. #else
  813. encoding = unicode::EUTFE_UTF16_LE;
  814. #endif
  815. append((uchar32_t)c);
  816. }
  817. //! Constructor for copying a UTF-8 string from a pointer with a given length.
  818. ustring16(const uchar8_t* const c, u32 length)
  819. : array(0), allocated(0), used(0)
  820. {
  821. #if __BYTE_ORDER == __BIG_ENDIAN
  822. encoding = unicode::EUTFE_UTF16_BE;
  823. #else
  824. encoding = unicode::EUTFE_UTF16_LE;
  825. #endif
  826. append(c, length);
  827. }
  828. //! Constructor for copying a UTF-16 string from a pointer.
  829. ustring16(const uchar16_t* const c)
  830. : array(0), allocated(0), used(0)
  831. {
  832. #if __BYTE_ORDER == __BIG_ENDIAN
  833. encoding = unicode::EUTFE_UTF16_BE;
  834. #else
  835. encoding = unicode::EUTFE_UTF16_LE;
  836. #endif
  837. append(c);
  838. }
  839. //! Constructor for copying a UTF-16 string from a pointer with a given length
  840. ustring16(const uchar16_t* const c, u32 length)
  841. : array(0), allocated(0), used(0)
  842. {
  843. #if __BYTE_ORDER == __BIG_ENDIAN
  844. encoding = unicode::EUTFE_UTF16_BE;
  845. #else
  846. encoding = unicode::EUTFE_UTF16_LE;
  847. #endif
  848. append(c, length);
  849. }
  850. //! Constructor for copying a UTF-32 string from a pointer.
  851. ustring16(const uchar32_t* const c)
  852. : array(0), allocated(0), used(0)
  853. {
  854. #if __BYTE_ORDER == __BIG_ENDIAN
  855. encoding = unicode::EUTFE_UTF16_BE;
  856. #else
  857. encoding = unicode::EUTFE_UTF16_LE;
  858. #endif
  859. append(c);
  860. }
  861. //! Constructor for copying a UTF-32 from a pointer with a given length.
  862. ustring16(const uchar32_t* const c, u32 length)
  863. : array(0), allocated(0), used(0)
  864. {
  865. #if __BYTE_ORDER == __BIG_ENDIAN
  866. encoding = unicode::EUTFE_UTF16_BE;
  867. #else
  868. encoding = unicode::EUTFE_UTF16_LE;
  869. #endif
  870. append(c, length);
  871. }
  872. //! Constructor for copying a wchar_t string from a pointer.
  873. ustring16(const wchar_t* const c)
  874. : array(0), allocated(0), used(0)
  875. {
  876. #if __BYTE_ORDER == __BIG_ENDIAN
  877. encoding = unicode::EUTFE_UTF16_BE;
  878. #else
  879. encoding = unicode::EUTFE_UTF16_LE;
  880. #endif
  881. if (sizeof(wchar_t) == 4)
  882. append(reinterpret_cast<const uchar32_t* const>(c));
  883. else if (sizeof(wchar_t) == 2)
  884. append(reinterpret_cast<const uchar16_t* const>(c));
  885. else if (sizeof(wchar_t) == 1)
  886. append(reinterpret_cast<const uchar8_t* const>(c));
  887. }
  888. //! Constructor for copying a wchar_t string from a pointer with a given length.
  889. ustring16(const wchar_t* const c, u32 length)
  890. : array(0), allocated(0), used(0)
  891. {
  892. #if __BYTE_ORDER == __BIG_ENDIAN
  893. encoding = unicode::EUTFE_UTF16_BE;
  894. #else
  895. encoding = unicode::EUTFE_UTF16_LE;
  896. #endif
  897. if (sizeof(wchar_t) == 4)
  898. append(reinterpret_cast<const uchar32_t* const>(c), length);
  899. else if (sizeof(wchar_t) == 2)
  900. append(reinterpret_cast<const uchar16_t* const>(c), length);
  901. else if (sizeof(wchar_t) == 1)
  902. append(reinterpret_cast<const uchar8_t* const>(c), length);
  903. }
  904. #ifdef USTRING_CPP0X
  905. //! Constructor for moving a ustring16
  906. ustring16(ustring16<TAlloc>&& other)
  907. : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
  908. {
  909. //std::cout << "MOVE constructor" << std::endl;
  910. other.array = 0;
  911. other.allocated = 0;
  912. other.used = 0;
  913. }
  914. #endif
  915. //! Destructor
  916. ~ustring16()
  917. {
  918. allocator.deallocate(array); // delete [] array;
  919. }
  920. //! Assignment operator
  921. ustring16& operator=(const ustring16<TAlloc>& other)
  922. {
  923. if (this == &other)
  924. return *this;
  925. used = other.size_raw();
  926. if (used >= allocated)
  927. {
  928. allocator.deallocate(array); // delete [] array;
  929. allocated = used + 1;
  930. array = allocator.allocate(used + 1); //new u16[used];
  931. }
  932. const uchar16_t* p = other.c_str();
  933. for (u32 i=0; i<=used; ++i, ++p)
  934. array[i] = *p;
  935. array[used] = 0;
  936. // Validate our new UTF-16 string.
  937. validate();
  938. return *this;
  939. }
  940. #ifdef USTRING_CPP0X
  941. //! Move assignment operator
  942. ustring16& operator=(ustring16<TAlloc>&& other)
  943. {
  944. if (this != &other)
  945. {
  946. //std::cout << "MOVE operator=" << std::endl;
  947. allocator.deallocate(array);
  948. array = other.array;
  949. allocated = other.allocated;
  950. encoding = other.encoding;
  951. used = other.used;
  952. other.array = 0;
  953. other.used = 0;
  954. }
  955. return *this;
  956. }
  957. #endif
  958. //! Assignment operator for other string types
  959. template <class B, class A>
  960. ustring16<TAlloc>& operator=(const string<B, A>& other)
  961. {
  962. *this = other.c_str();
  963. return *this;
  964. }
  965. //! Assignment operator for UTF-8 strings
  966. ustring16<TAlloc>& operator=(const uchar8_t* const c)
  967. {
  968. if (!array)
  969. {
  970. array = allocator.allocate(1); //new u16[1];
  971. allocated = 1;
  972. }
  973. used = 0;
  974. array[used] = 0x0;
  975. if (!c) return *this;
  976. //! Append our string now.
  977. append(c);
  978. return *this;
  979. }
  980. //! Assignment operator for UTF-16 strings
  981. ustring16<TAlloc>& operator=(const uchar16_t* const c)
  982. {
  983. if (!array)
  984. {
  985. array = allocator.allocate(1); //new u16[1];
  986. allocated = 1;
  987. }
  988. used = 0;
  989. array[used] = 0x0;
  990. if (!c) return *this;
  991. //! Append our string now.
  992. append(c);
  993. return *this;
  994. }
  995. //! Assignment operator for UTF-32 strings
  996. ustring16<TAlloc>& operator=(const uchar32_t* const c)
  997. {
  998. if (!array)
  999. {
  1000. array = allocator.allocate(1); //new u16[1];
  1001. allocated = 1;
  1002. }
  1003. used = 0;
  1004. array[used] = 0x0;
  1005. if (!c) return *this;
  1006. //! Append our string now.
  1007. append(c);
  1008. return *this;
  1009. }
  1010. //! Assignment operator for wchar_t strings.
  1011. /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
  1012. Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
  1013. This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
  1014. ustring16<TAlloc>& operator=(const wchar_t* const c)
  1015. {
  1016. if (sizeof(wchar_t) == 4)
  1017. *this = reinterpret_cast<const uchar32_t* const>(c);
  1018. else if (sizeof(wchar_t) == 2)
  1019. *this = reinterpret_cast<const uchar16_t* const>(c);
  1020. else if (sizeof(wchar_t) == 1)
  1021. *this = reinterpret_cast<const uchar8_t* const>(c);
  1022. return *this;
  1023. }
  1024. //! Assignment operator for other strings.
  1025. /** Note that this assumes that a correct unicode string is stored in the string. **/
  1026. template <class B>
  1027. ustring16<TAlloc>& operator=(const B* const c)
  1028. {
  1029. if (sizeof(B) == 4)
  1030. *this = reinterpret_cast<const uchar32_t* const>(c);
  1031. else if (sizeof(B) == 2)
  1032. *this = reinterpret_cast<const uchar16_t* const>(c);
  1033. else if (sizeof(B) == 1)
  1034. *this = reinterpret_cast<const uchar8_t* const>(c);
  1035. return *this;
  1036. }
  1037. //! Direct access operator
  1038. access operator [](const u32 index)
  1039. {
  1040. _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
  1041. iterator iter(*this, index);
  1042. return iter.operator*();
  1043. }
  1044. //! Direct access operator
  1045. const access operator [](const u32 index) const
  1046. {
  1047. _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
  1048. const_iterator iter(*this, index);
  1049. return iter.operator*();
  1050. }
  1051. //! Equality operator
  1052. bool operator ==(const uchar16_t* const str) const
  1053. {
  1054. if (!str)
  1055. return false;
  1056. u32 i;
  1057. for(i=0; array[i] && str[i]; ++i)
  1058. if (array[i] != str[i])
  1059. return false;
  1060. return !array[i] && !str[i];
  1061. }
  1062. //! Equality operator
  1063. bool operator ==(const ustring16<TAlloc>& other) const
  1064. {
  1065. for(u32 i=0; array[i] && other.array[i]; ++i)
  1066. if (array[i] != other.array[i])
  1067. return false;
  1068. return used == other.used;
  1069. }
  1070. //! Is smaller comparator
  1071. bool operator <(const ustring16<TAlloc>& other) const
  1072. {
  1073. for(u32 i=0; array[i] && other.array[i]; ++i)
  1074. {
  1075. s32 diff = array[i] - other.array[i];
  1076. if ( diff )
  1077. return diff < 0;
  1078. }
  1079. return used < other.used;
  1080. }
  1081. //! Inequality operator
  1082. bool operator !=(const uchar16_t* const str) const
  1083. {
  1084. return !(*this == str);
  1085. }
  1086. //! Inequality operator
  1087. bool operator !=(const ustring16<TAlloc>& other) const
  1088. {
  1089. return !(*this == other);
  1090. }
  1091. //! Returns the length of a ustring16 in full characters.
  1092. //! \return Length of a ustring16 in full characters.
  1093. u32 size() const
  1094. {
  1095. const_iterator i(*this, 0);
  1096. u32 pos = 0;
  1097. while (!i.atEnd())
  1098. {
  1099. ++i;
  1100. ++pos;
  1101. }
  1102. return pos;
  1103. }
  1104. //! Informs if the ustring is empty or not.
  1105. //! \return True if the ustring is empty, false if not.
  1106. bool empty() const
  1107. {
  1108. return (size_raw() == 0);
  1109. }
  1110. //! Returns a pointer to the raw UTF-16 string data.
  1111. //! \return pointer to C-style NUL terminated array of UTF-16 code points.
  1112. const uchar16_t* c_str() const
  1113. {
  1114. return array;
  1115. }
  1116. //! Compares the first n characters of this string with another.
  1117. //! \param other Other string to compare to.
  1118. //! \param n Number of characters to compare.
  1119. //! \return True if the n first characters of both strings are equal.
  1120. bool equalsn(const ustring16<TAlloc>& other, u32 n) const
  1121. {
  1122. u32 i;
  1123. const uchar16_t* oa = other.c_str();
  1124. for(i=0; i < n && array[i] && oa[i]; ++i)
  1125. if (array[i] != oa[i])
  1126. return false;
  1127. // if one (or both) of the strings was smaller then they
  1128. // are only equal if they have the same length
  1129. return (i == n) || (used == other.used);
  1130. }
  1131. //! Compares the first n characters of this string with another.
  1132. //! \param str Other string to compare to.
  1133. //! \param n Number of characters to compare.
  1134. //! \return True if the n first characters of both strings are equal.
  1135. bool equalsn(const uchar16_t* const str, u32 n) const
  1136. {
  1137. if (!str)
  1138. return false;
  1139. u32 i;
  1140. for(i=0; i < n && array[i] && str[i]; ++i)
  1141. if (array[i] != str[i])
  1142. return false;
  1143. // if one (or both) of the strings was smaller then they
  1144. // are only equal if they have the same length
  1145. return (i == n) || (array[i] == 0 && str[i] == 0);
  1146. }
  1147. //! Appends a character to this ustring16
  1148. //! \param character The character to append.
  1149. //! \return A reference to our current string.
  1150. ustring16<TAlloc>& append(uchar32_t character)
  1151. {
  1152. if (used + 2 >= allocated)
  1153. reallocate(used + 2);
  1154. if (character > 0xFFFF)
  1155. {
  1156. used += 2;
  1157. // character will be multibyte, so split it up into a surrogate pair.
  1158. uchar16_t x = static_cast<uchar16_t>(character);
  1159. uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
  1160. uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
  1161. array[used-2] = vh;
  1162. array[used-1] = vl;
  1163. }
  1164. else
  1165. {
  1166. ++used;
  1167. array[used-1] = character;
  1168. }
  1169. array[used] = 0;
  1170. return *this;
  1171. }
  1172. //! Appends a UTF-8 string to this ustring16
  1173. //! \param other The UTF-8 string to append.
  1174. //! \param length The length of the string to append.
  1175. //! \return A reference to our current string.
  1176. ustring16<TAlloc>& append(const uchar8_t* const other, u32 length=0xffffffff)
  1177. {
  1178. if (!other)
  1179. return *this;
  1180. // Determine if the string is long enough for a BOM.
  1181. u32 len = 0;
  1182. const uchar8_t* p = other;
  1183. do
  1184. {
  1185. ++len;
  1186. } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
  1187. // Check for BOM.
  1188. unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
  1189. if (len == unicode::BOM_ENCODE_UTF8_LEN)
  1190. {
  1191. if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
  1192. c_bom = unicode::EUTFE_UTF8;
  1193. }
  1194. // If a BOM was found, don't include it in the string.
  1195. const uchar8_t* c2 = other;
  1196. if (c_bom != unicode::EUTFE_NONE)
  1197. {
  1198. c2 = other + unicode::BOM_UTF8_LEN;
  1199. length -= unicode::BOM_UTF8_LEN;
  1200. }
  1201. // Calculate the size of the string to read in.
  1202. len = 0;
  1203. p = c2;
  1204. do
  1205. {
  1206. ++len;
  1207. } while(*p++ && len < length);
  1208. if (len > length)
  1209. len = length;
  1210. // If we need to grow the array, do it now.
  1211. if (used + len >= allocated)
  1212. reallocate(used + (len * 2));
  1213. u32 start = used;
  1214. // Convert UTF-8 to UTF-16.
  1215. u32 pos = start;
  1216. for (u32 l = 0; l<len;)
  1217. {
  1218. ++used;
  1219. if (((c2[l] >> 6) & 0x03) == 0x02)
  1220. { // Invalid continuation byte.
  1221. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1222. ++l;
  1223. }
  1224. else if (c2[l] == 0xC0 || c2[l] == 0xC1)
  1225. { // Invalid byte - overlong encoding.
  1226. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1227. ++l;
  1228. }
  1229. else if ((c2[l] & 0xF8) == 0xF0)
  1230. { // 4 bytes UTF-8, 2 bytes UTF-16.
  1231. // Check for a full string.
  1232. if ((l + 3) >= len)
  1233. {
  1234. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1235. l += 3;
  1236. break;
  1237. }
  1238. // Validate.
  1239. bool valid = true;
  1240. u8 l2 = 0;
  1241. if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
  1242. if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
  1243. if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
  1244. if (!valid)
  1245. {
  1246. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1247. l += l2;
  1248. continue;
  1249. }
  1250. // Decode.
  1251. uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
  1252. uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
  1253. uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
  1254. uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
  1255. // Split v up into a surrogate pair.
  1256. uchar16_t x = static_cast<uchar16_t>(v);
  1257. uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
  1258. uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
  1259. array[pos++] = vh;
  1260. array[pos++] = vl;
  1261. l += 4;
  1262. ++used; // Using two shorts this time, so increase used by 1.
  1263. }
  1264. else if ((c2[l] & 0xF0) == 0xE0)
  1265. { // 3 bytes UTF-8, 1 byte UTF-16.
  1266. // Check for a full string.
  1267. if ((l + 2) >= len)
  1268. {
  1269. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1270. l += 2;
  1271. break;
  1272. }
  1273. // Validate.
  1274. bool valid = true;
  1275. u8 l2 = 0;
  1276. if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
  1277. if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
  1278. if (!valid)
  1279. {
  1280. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1281. l += l2;
  1282. continue;
  1283. }
  1284. // Decode.
  1285. uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
  1286. uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
  1287. uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
  1288. array[pos++] = ch;
  1289. l += 3;
  1290. }
  1291. else if ((c2[l] & 0xE0) == 0xC0)
  1292. { // 2 bytes UTF-8, 1 byte UTF-16.
  1293. // Check for a full string.
  1294. if ((l + 1) >= len)
  1295. {
  1296. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1297. l += 1;
  1298. break;
  1299. }
  1300. // Validate.
  1301. if (((c2[l+1] >> 6) & 0x03) != 0x02)
  1302. {
  1303. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1304. ++l;
  1305. continue;
  1306. }
  1307. // Decode.
  1308. uchar8_t b1 = (c2[l] >> 2) & 0x7;
  1309. uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
  1310. uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
  1311. array[pos++] = ch;
  1312. l += 2;
  1313. }
  1314. else
  1315. { // 1 byte UTF-8, 1 byte UTF-16.
  1316. // Validate.
  1317. if (c2[l] > 0x7F)
  1318. { // Values above 0xF4 are restricted and aren't used. By now, anything above 0x7F is invalid.
  1319. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1320. }
  1321. else array[pos++] = static_cast<uchar16_t>(c2[l]);
  1322. ++l;
  1323. }
  1324. }
  1325. array[used] = 0;
  1326. // Validate our new UTF-16 string.
  1327. validate();
  1328. return *this;
  1329. }
  1330. //! Appends a UTF-16 string to this ustring16
  1331. //! \param other The UTF-16 string to append.
  1332. //! \param length The length of the string to append.
  1333. //! \return A reference to our current string.
  1334. ustring16<TAlloc>& append(const uchar16_t* const other, u32 length=0xffffffff)
  1335. {
  1336. if (!other)
  1337. return *this;
  1338. // Determine if the string is long enough for a BOM.
  1339. u32 len = 0;
  1340. const uchar16_t* p = other;
  1341. do
  1342. {
  1343. ++len;
  1344. } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
  1345. // Check for the BOM to determine the string's endianness.
  1346. unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
  1347. if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
  1348. c_end = unicode::EUTFEE_LITTLE;
  1349. else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
  1350. c_end = unicode::EUTFEE_BIG;
  1351. // If a BOM was found, don't include it in the string.
  1352. const uchar16_t* c2 = other;
  1353. if (c_end != unicode::EUTFEE_NATIVE)
  1354. {
  1355. c2 = other + unicode::BOM_UTF16_LEN;
  1356. length -= unicode::BOM_UTF16_LEN;
  1357. }
  1358. // Calculate the size of the string to read in.
  1359. len = 0;
  1360. p = c2;
  1361. do
  1362. {
  1363. ++len;
  1364. } while(*p++ && len < length);
  1365. if (len > length)
  1366. len = length;
  1367. // If we need to grow the size of the array, do it now.
  1368. if (used + len >= allocated)
  1369. reallocate(used + (len * 2));
  1370. u32 start = used;
  1371. used += len;
  1372. // Copy the string now.
  1373. unicode::EUTF_ENDIAN m_end = getEndianness();
  1374. for (u32 l = start; l < start + len; ++l)
  1375. {
  1376. array[l] = (uchar16_t)c2[l];
  1377. if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
  1378. array[l] = unicode::swapEndian16(array[l]);
  1379. }
  1380. array[used] = 0;
  1381. // Validate our new UTF-16 string.
  1382. validate();
  1383. return *this;
  1384. }
  1385. //! Appends a UTF-32 string to this ustring16
  1386. //! \param other The UTF-32 string to append.
  1387. //! \param length The length of the string to append.
  1388. //! \return A reference to our current string.
  1389. ustring16<TAlloc>& append(const uchar32_t* const other, u32 length=0xffffffff)
  1390. {
  1391. if (!other)
  1392. return *this;
  1393. // Check for the BOM to determine the string's endianness.
  1394. unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
  1395. if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
  1396. c_end = unicode::EUTFEE_LITTLE;
  1397. else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
  1398. c_end = unicode::EUTFEE_BIG;
  1399. // If a BOM was found, don't include it in the string.
  1400. const uchar32_t* c2 = other;
  1401. if (c_end != unicode::EUTFEE_NATIVE)
  1402. {
  1403. c2 = other + unicode::BOM_UTF32_LEN;
  1404. length -= unicode::BOM_UTF32_LEN;
  1405. }
  1406. // Calculate the size of the string to read in.
  1407. u32 len = 0;
  1408. const uchar32_t* p = c2;
  1409. do
  1410. {
  1411. ++len;
  1412. } while(*p++ && len < length);
  1413. if (len > length)
  1414. len = length;
  1415. // If we need to grow the size of the array, do it now.
  1416. // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
  1417. if (used + (len * 2) >= allocated)
  1418. reallocate(used + ((len * 2) * 2));
  1419. u32 start = used;
  1420. // Convert UTF-32 to UTF-16.
  1421. unicode::EUTF_ENDIAN m_end = getEndianness();
  1422. u32 pos = start;
  1423. for (u32 l = 0; l<len; ++l)
  1424. {
  1425. ++used;
  1426. uchar32_t ch = c2[l];
  1427. if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
  1428. ch = unicode::swapEndian32(ch);
  1429. if (ch > 0xFFFF)
  1430. {
  1431. // Split ch up into a surrogate pair as it is over 16 bits long.
  1432. uchar16_t x = static_cast<uchar16_t>(ch);
  1433. uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
  1434. uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
  1435. array[pos++] = vh;
  1436. array[pos++] = vl;
  1437. ++used; // Using two shorts, so increased used again.
  1438. }
  1439. else if (ch >= 0xD800 && ch <= 0xDFFF)
  1440. {
  1441. // Between possible UTF-16 surrogates (invalid!)
  1442. array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
  1443. }
  1444. else array[pos++] = static_cast<uchar16_t>(ch);
  1445. }
  1446. array[used] = 0;
  1447. // Validate our new UTF-16 string.
  1448. validate();
  1449. return *this;
  1450. }
  1451. //! Appends a ustring16 to this ustring16
  1452. //! \param other The string to append to this one.
  1453. //! \return A reference to our current string.
  1454. ustring16<TAlloc>& append(const ustring16<TAlloc>& other)
  1455. {
  1456. const uchar16_t* oa = other.c_str();
  1457. u32 len = other.size_raw();
  1458. if (used + len >= allocated)
  1459. reallocate(used + len);
  1460. for (u32 l=0; l<len; ++l)
  1461. array[used+l] = oa[l];
  1462. used += len;
  1463. array[used] = 0;
  1464. return *this;
  1465. }
  1466. //! Appends a certain amount of characters of a ustring16 to this ustring16.
  1467. //! \param other The string to append to this one.
  1468. //! \param length How many characters of the other string to add to this one.
  1469. //! \return A reference to our current string.
  1470. ustring16<TAlloc>& append(const ustring16<TAlloc>& other, u32 length)
  1471. {
  1472. if (other.size() == 0)
  1473. return *this;
  1474. if (other.size() < length)
  1475. {
  1476. append(other);
  1477. return *this;
  1478. }
  1479. if (used + length * 2 >= allocated)
  1480. reallocate(used + length * 2);
  1481. const_iterator iter(other, 0);
  1482. u32 l = length;
  1483. while (!iter.atEnd() && l)
  1484. {
  1485. uchar32_t c = *iter;
  1486. append(c);
  1487. ++iter;
  1488. --l;
  1489. }
  1490. return *this;
  1491. }
  1492. //! Reserves some memory.
  1493. //! \param count The amount of characters to reserve.
  1494. void reserve(u32 count)
  1495. {
  1496. if (count < allocated)
  1497. return;
  1498. reallocate(count);
  1499. }
  1500. //! Finds first occurrence of character.
  1501. //! \param c The character to search for.
  1502. //! \return Position where the character has been found, or -1 if not found.
  1503. s32 findFirst(uchar32_t c) const
  1504. {
  1505. const_iterator i(*this, 0);
  1506. s32 pos = 0;
  1507. while (!i.atEnd())
  1508. {
  1509. uchar32_t t = *i;
  1510. if (c == t)
  1511. return pos;
  1512. ++pos;
  1513. ++i;
  1514. }
  1515. return -1;
  1516. }
  1517. //! Finds first occurrence of a character of a list.
  1518. //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
  1519. //! \param count The amount of characters in the list. Usually, this should be strlen(c).
  1520. //! \return Position where one of the characters has been found, or -1 if not found.
  1521. s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
  1522. {
  1523. if (!c || !count)
  1524. return -1;
  1525. const_iterator i(*this, 0);
  1526. s32 pos = 0;
  1527. while (!i.atEnd())
  1528. {
  1529. uchar32_t t = *i;
  1530. for (u32 j=0; j<count; ++j)
  1531. if (t == c[j])
  1532. return pos;
  1533. ++pos;
  1534. ++i;
  1535. }
  1536. return -1;
  1537. }
  1538. //! Finds first position of a character not in a given list.
  1539. //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
  1540. //! \param count The amount of characters in the list. Usually, this should be strlen(c).
  1541. //! \return Position where the character has been found, or -1 if not found.
  1542. s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
  1543. {
  1544. if (!c || !count)
  1545. return -1;
  1546. const_iterator i(*this, 0);
  1547. s32 pos = 0;
  1548. while (!i.atEnd())
  1549. {
  1550. uchar32_t t = *i;
  1551. u32 j;
  1552. for (j=0; j<count; ++j)
  1553. if (t == c[j])
  1554. break;
  1555. if (j==count)
  1556. return pos;
  1557. ++pos;
  1558. ++i;
  1559. }
  1560. return -1;
  1561. }
  1562. //! Finds last position of a character not in a given list.
  1563. //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
  1564. //! \param count The amount of characters in the list. Usually, this should be strlen(c).
  1565. //! \return Position where the character has been found, or -1 if not found.
  1566. s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
  1567. {
  1568. if (!c || !count)
  1569. return -1;
  1570. const_iterator i(end());
  1571. --i;
  1572. s32 pos = size() - 1;
  1573. while (!i.atStart())
  1574. {
  1575. uchar32_t t = *i;
  1576. u32 j;
  1577. for (j=0; j<count; ++j)
  1578. if (t == c[j])
  1579. break;
  1580. if (j==count)
  1581. return pos;
  1582. --pos;
  1583. --i;
  1584. }
  1585. return -1;
  1586. }
  1587. //! Finds next occurrence of character.
  1588. //! \param c The character to search for.
  1589. //! \param startPos The position in the string to start searching.
  1590. //! \return Position where the character has been found, or -1 if not found.
  1591. s32 findNext(uchar32_t c, u32 startPos) const
  1592. {
  1593. const_iterator i(*this, startPos);
  1594. s32 pos = startPos;
  1595. while (!i.atEnd())
  1596. {
  1597. uchar32_t t = *i;
  1598. if (t == c)
  1599. return pos;
  1600. ++pos;
  1601. ++i;
  1602. }
  1603. return -1;
  1604. }
  1605. //! Finds last occurrence of character.
  1606. //! \param c The character to search for.
  1607. //! \param start The start position of the reverse search ( default = -1, on end ).
  1608. //! \return Position where the character has been found, or -1 if not found.
  1609. s32 findLast(uchar32_t c, s32 start = -1) const
  1610. {
  1611. u32 s = size();
  1612. start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
  1613. const_iterator i(*this, start);
  1614. u32 pos = start;
  1615. while (!i.atStart())
  1616. {
  1617. uchar32_t t = *i;
  1618. if (t == c)
  1619. return pos;
  1620. --pos;
  1621. --i;
  1622. }
  1623. return -1;
  1624. }
  1625. //! Finds last occurrence of a character in a list.
  1626. //! \param c A list of strings to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
  1627. //! \param count The amount of characters in the list. Usually, this should be strlen(c).
  1628. //! \return Position where one of the characters has been found, or -1 if not found.
  1629. s32 findLastChar(const uchar32_t* const c, u32 count=1) const
  1630. {
  1631. if (!c || !count)
  1632. return -1;
  1633. const_iterator i(end());
  1634. --i;
  1635. s32 pos = size();
  1636. while (!i.atStart())
  1637. {
  1638. uchar32_t t = *i;
  1639. for (u32 j=0; j<count; ++j)
  1640. if (t == c[j])
  1641. return pos;
  1642. --pos;
  1643. --i;
  1644. }
  1645. return -1;
  1646. }
  1647. //! Finds another ustring16 in this ustring16.
  1648. //! \param str The string to find.
  1649. //! \param start The start position of the search.
  1650. //! \return Positions where the ustring16 has been found, or -1 if not found.
  1651. s32 find(const ustring16<TAlloc>& str, const u32 start = 0) const
  1652. {
  1653. u32 my_size = size();
  1654. u32 their_size = str.size();
  1655. if (their_size == 0 || my_size - start < their_size)
  1656. return -1;
  1657. const_iterator i(*this, start);
  1658. s32 pos = start;
  1659. while (!i.atEnd())
  1660. {
  1661. const_iterator i2(i);
  1662. const_iterator j(str, 0);
  1663. uchar32_t t1 = (uchar32_t)*i2;
  1664. uchar32_t t2 = (uchar32_t)*j;
  1665. while (t1 == t2)
  1666. {
  1667. ++i2;
  1668. ++j;
  1669. if (j.atEnd())
  1670. return pos;
  1671. t1 = (uchar32_t)*i2;
  1672. t2 = (uchar32_t)*j;
  1673. }
  1674. ++i;
  1675. ++pos;
  1676. }
  1677. return -1;
  1678. }
  1679. //! Finds another ustring16 in this ustring16.
  1680. //! \param str The string to find.
  1681. //! \param start The start position of the search.
  1682. //! \return Positions where the string has been found, or -1 if not found.
  1683. s32 find_raw(const ustring16<TAlloc>& str, const u32 start = 0) const
  1684. {
  1685. const uchar16_t* data = str.c_str();
  1686. if (data && *data)
  1687. {
  1688. u32 len = 0;
  1689. while (data[len])
  1690. ++len;
  1691. if (len > used)
  1692. return -1;
  1693. for (u32 i=start; i<=used-len; ++i)
  1694. {
  1695. u32 j=0;
  1696. while(data[j] && array[i+j] == data[j])
  1697. ++j;
  1698. if (!data[j])
  1699. return i;
  1700. }
  1701. }
  1702. return -1;
  1703. }
  1704. //! Returns a substring.
  1705. //! \param begin: Start of substring.
  1706. //! \param length: Length of substring.
  1707. //! \return A reference to our current string.
  1708. ustring16<TAlloc> subString(u32 begin, s32 length) const
  1709. {
  1710. u32 len = size();
  1711. // if start after ustring16
  1712. // or no proper substring length
  1713. if ((length <= 0) || (begin>=len))
  1714. return ustring16<TAlloc>("");
  1715. // clamp length to maximal value
  1716. if ((length+begin) > len)
  1717. length = len-begin;
  1718. ustring16<TAlloc> o;
  1719. o.reserve((length+1) * 2);
  1720. const_iterator i(*this, begin);
  1721. while (!i.atEnd() && length)
  1722. {
  1723. o.append(*i);
  1724. ++i;
  1725. --length;
  1726. }
  1727. return o;
  1728. }
  1729. //! Appends a character to this ustring16.
  1730. //! \param c Character to append.
  1731. //! \return A reference to our current string.
  1732. ustring16<TAlloc>& operator += (char c)
  1733. {
  1734. append((uchar32_t)c);
  1735. return *this;
  1736. }
  1737. //! Appends a character to this ustring16.
  1738. //! \param c Character to append.
  1739. //! \return A reference to our current string.
  1740. ustring16<TAlloc>& operator += (uchar32_t c)
  1741. {
  1742. append(c);
  1743. return *this;
  1744. }
  1745. //! Appends a number to this ustring16.
  1746. //! \param c Number to append.
  1747. //! \return A reference to our current string.
  1748. ustring16<TAlloc>& operator += (short c)
  1749. {
  1750. append(core::stringc(c));
  1751. return *this;
  1752. }
  1753. //! Appends a number to this ustring16.
  1754. //! \param c Number to append.
  1755. //! \return A reference to our current string.
  1756. ustring16<TAlloc>& operator += (unsigned short c)
  1757. {
  1758. append(core::stringc(c));
  1759. return *this;
  1760. }
  #ifdef USTRING_CPP0X_NEWLITERALS
  //! Appends the textual form of an `int`, as produced by core::stringc.
  //! Only compiled when USTRING_CPP0X_NEWLITERALS is defined.
  //! \param c The number to append.
  //! \return A reference to our current string.
  ustring16<TAlloc>& operator += (int c)
  {
      ustring16<TAlloc>& self = *this;
      self.append(core::stringc(c));
      return self;
  }

  //! Appends the textual form of an `unsigned int`, as produced by core::stringc.
  //! Only compiled when USTRING_CPP0X_NEWLITERALS is defined.
  //! \param c The number to append.
  //! \return A reference to our current string.
  ustring16<TAlloc>& operator += (unsigned int c)
  {
      ustring16<TAlloc>& self = *this;
      self.append(core::stringc(c));
      return self;
  }
  #endif
  1779. //! Appends a number to this ustring16.
  1780. //! \param c Number to append.
  1781. //! \return A reference to our current string.
  1782. ustring16<TAlloc>& operator += (long c)
  1783. {
  1784. append(core::stringc(c));
  1785. return *this;
  1786. }
  1787. //! Appends a number to this ustring16.
  1788. //! \param c Number to append.
  1789. //! \return A reference to our current string.
  1790. ustring16<TAlloc>& operator += (unsigned long c)
  1791. {
  1792. append(core::stringc(c));
  1793. return *this;
  1794. }
  1795. //! Appends a number to this ustring16.
  1796. //! \param c Number to append.
  1797. //! \return A reference to our current string.
  1798. ustring16<TAlloc>& operator += (double c)
  1799. {
  1800. append(core::stringc(c));
  1801. return *this;
  1802. }
  1803. //! Appends a char ustring16 to this ustring16.
  1804. //! \param c Char ustring16 to append.
  1805. //! \return A reference to our current string.
  1806. ustring16<TAlloc>& operator += (const uchar16_t* const c)
  1807. {
  1808. append(c);
  1809. return *this;
  1810. }
  1811. //! Appends a ustring16 to this ustring16.
  1812. //! \param other ustring16 to append.
  1813. //! \return A reference to our current string.
  1814. ustring16<TAlloc>& operator += (const ustring16<TAlloc>& other)
  1815. {
  1816. append(other);
  1817. return *this;
  1818. }
  1819. //! Replaces all characters of a given type with another one.
  1820. //! \param toReplace Character to replace.
  1821. //! \param replaceWith Character replacing the old one.
  1822. //! \return A reference to our current string.
  1823. ustring16<TAlloc>& replace(uchar32_t toReplace, uchar32_t replaceWith)
  1824. {
  1825. iterator i(*this, 0);
  1826. while (!i.atEnd())
  1827. {
  1828. typename ustring16<TAlloc>::access a = *i;
  1829. if ((uchar32_t)a == toReplace)
  1830. a = replaceWith;
  1831. ++i;
  1832. }
  1833. return *this;
  1834. }
  1835. //! Replaces all instances of a string with another one.
  1836. //! \param toReplace The string to replace.
  1837. //! \param replaceWith The string replacing the old one.
  1838. //! \return A reference to our current string.
  1839. ustring16<TAlloc>& replace(const ustring16<TAlloc>& toReplace, const ustring16<TAlloc>& replaceWith)
  1840. {
  1841. if (toReplace.size() == 0)
  1842. return *this;
  1843. const uchar16_t* other = toReplace.c_str();
  1844. const uchar16_t* replace = replaceWith.c_str();
  1845. const u32 other_size = toReplace.size_raw();
  1846. const u32 replace_size = replaceWith.size_raw();
  1847. // Determine the delta. The algorithm will change depending on the delta.
  1848. s32 delta = replace_size - other_size;
  1849. // A character for character replace. The string will not shrink or grow.
  1850. if (delta == 0)
  1851. {
  1852. s32 pos = 0;
  1853. while ((pos = find_raw(other, pos)) != -1)
  1854. {
  1855. for (u32 i = 0; i < replace_size; ++i)
  1856. array[pos + i] = replace[i];
  1857. ++pos;
  1858. }
  1859. return *this;
  1860. }
  1861. // We are going to be removing some characters. The string will shrink.
  1862. if (delta < 0)
  1863. {
  1864. u32 i = 0;
  1865. for (u32 pos = 0; pos <= used; ++i, ++pos)
  1866. {
  1867. // Is this potentially a match?
  1868. if (array[pos] == *other)
  1869. {
  1870. // Check to see if we have a match.
  1871. u32 j;
  1872. for (j = 0; j < other_size; ++j)
  1873. {
  1874. if (array[pos + j] != other[j])
  1875. break;
  1876. }
  1877. // If we have a match, replace characters.
  1878. if (j == other_size)
  1879. {
  1880. for (j = 0; j < replace_size; ++j)
  1881. array[i + j] = replace[j];
  1882. i += replace_size - 1;
  1883. pos += other_size - 1;
  1884. continue;
  1885. }
  1886. }
  1887. // No match found, just copy characters.
  1888. array[i - 1] = array[pos];
  1889. }
  1890. array[i] = 0;
  1891. used = i;
  1892. return *this;
  1893. }
  1894. // We are going to be adding characters, so the string size will increase.
  1895. // Count the number of times toReplace exists in the string so we can allocate the new size.
  1896. u32 find_count = 0;
  1897. s32 pos = 0;
  1898. while ((pos = find_raw(other, pos)) != -1)
  1899. {
  1900. ++find_count;
  1901. ++pos;
  1902. }
  1903. // Re-allocate the string now, if needed.
  1904. u32 len = delta * find_count;
  1905. if (used + len >= allocated)
  1906. reallocate(used + len);
  1907. // Start replacing.
  1908. pos = 0;
  1909. while ((pos = find_raw(other, pos)) != -1)
  1910. {
  1911. uchar16_t* start = array + pos + other_size - 1;
  1912. uchar16_t* ptr = array + used;
  1913. uchar16_t* end = array + used + delta;
  1914. // Shift characters to make room for the string.
  1915. while (ptr != start)
  1916. {
  1917. *end = *ptr;
  1918. --ptr;
  1919. --end;
  1920. }
  1921. // Add the new string now.
  1922. for (u32 i = 0; i < replace_size; ++i)
  1923. array[pos + i] = replace[i];
  1924. pos += replace_size;
  1925. used += delta;
  1926. }
  1927. // Terminate the string and return ourself.
  1928. array[used] = 0;
  1929. return *this;
  1930. }
  1931. //! Removes characters from a ustring16..
  1932. //! \param c The character to remove.
  1933. //! \return A reference to our current string.
  1934. ustring16<TAlloc>& remove(uchar32_t c)
  1935. {
  1936. u32 pos = 0;
  1937. u32 found = 0;
  1938. u32 len = (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character.
  1939. for (u32 i=0; i<=used; ++i)
  1940. {
  1941. uchar32_t uc32 = 0;
  1942. if (!UTF16_IS_SURROGATE_HI(array[i]))
  1943. uc32 |= array[i];
  1944. else if (i + 1 <= used)
  1945. {
  1946. // Convert the surrogate pair into a single UTF-32 character.
  1947. uc32 = unicode::toUTF32(array[i], array[i + 1]);
  1948. }
  1949. u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
  1950. if (uc32 == c)
  1951. {
  1952. found += len;
  1953. continue;
  1954. }
  1955. array[pos++] = array[i];
  1956. if (len2 == 2)
  1957. array[pos++] = array[++i];
  1958. }
  1959. used -= found;
  1960. array[used] = 0;
  1961. return *this;
  1962. }
  1963. //! Removes a ustring16 from the ustring16.
  1964. //! \param toRemove The string to remove.
  1965. //! \return A reference to our current string.
  1966. ustring16<TAlloc>& remove(const ustring16<TAlloc>& toRemove)
  1967. {
  1968. u32 size = toRemove.size_raw();
  1969. if (size == 0) return *this;
  1970. const uchar16_t* tra = toRemove.c_str();
  1971. u32 pos = 0;
  1972. u32 found = 0;
  1973. for (u32 i=0; i<=used; ++i)
  1974. {
  1975. u32 j = 0;
  1976. while (j < size)
  1977. {
  1978. if (array[i + j] != tra[j])
  1979. break;
  1980. ++j;
  1981. }
  1982. if (j == size)
  1983. {
  1984. found += size;
  1985. i += size - 1;
  1986. continue;
  1987. }
  1988. array[pos++] = array[i];
  1989. }
  1990. used -= found;
  1991. array[used] = 0;
  1992. return *this;
  1993. }
  1994. //! Removes characters from the ustring16.
  1995. //! \param characters The characters to remove.
  1996. //! \return A reference to our current string.
  1997. ustring16<TAlloc>& removeChars(const ustring16<TAlloc>& characters)
  1998. {
  1999. if (characters.size_raw() == 0)
  2000. return *this;
  2001. u32 pos = 0;
  2002. u32 found = 0;
  2003. const_iterator iter(characters);
  2004. for (u32 i=0; i<=used; ++i)
  2005. {
  2006. uchar32_t uc32 = 0;
  2007. if (!UTF16_IS_SURROGATE_HI(array[i]))
  2008. uc32 |= array[i];
  2009. else if (i + 1 <= used)
  2010. {
  2011. // Convert the surrogate pair into a single UTF-32 character.
  2012. uc32 = unicode::toUTF32(array[i], array[i+1]);
  2013. }
  2014. u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
  2015. bool cont = false;
  2016. iter.toStart();
  2017. while (!iter.atEnd())
  2018. {
  2019. uchar32_t c = *iter;
  2020. if (uc32 == c)
  2021. {
  2022. found += (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character.
  2023. ++i;
  2024. cont = true;
  2025. break;
  2026. }
  2027. ++iter;
  2028. }
  2029. if (cont) continue;
  2030. array[pos++] = array[i];
  2031. if (len2 == 2)
  2032. array[pos++] = array[++i];
  2033. }
  2034. used -= found;
  2035. array[used] = 0;
  2036. return *this;
  2037. }
  2038. //! Trims the ustring16.
  2039. //! Removes the specified characters (by default, Latin-1 whitespace) from the begining and the end of the ustring16.
  2040. //! \param whitespace The characters that are to be considered as whitespace.
  2041. //! \return A reference to our current string.
  2042. ustring16<TAlloc>& trim(const ustring16<TAlloc>& whitespace = " \t\n\r")
  2043. {
  2044. core::array<uchar32_t> utf32white = whitespace.toUTF32();
  2045. // find start and end of the substring without the specified characters
  2046. const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
  2047. if (begin == -1)
  2048. return (*this="");
  2049. const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
  2050. return (*this = subString(begin, (end +1) - begin));
  2051. }
  2052. //! Erases a character from the ustring16.
  2053. //! May be slow, because all elements following after the erased element have to be copied.
  2054. //! \param index Index of element to be erased.
  2055. //! \return A reference to our current string.
  2056. ustring16<TAlloc>& erase(u32 index)
  2057. {
  2058. _IRR_DEBUG_BREAK_IF(index>used) // access violation
  2059. iterator i(*this, index);
  2060. uchar32_t t = *i;
  2061. u32 len = (t > 0xFFFF ? 2 : 1);
  2062. for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
  2063. array[j - len] = array[j];
  2064. used -= len;
  2065. array[used] = 0;
  2066. return *this;
  2067. }
  2068. //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
  2069. //! \return A reference to our current string.
  2070. ustring16<TAlloc>& validate()
  2071. {
  2072. // Validate all unicode characters.
  2073. for (u32 i=0; i<allocated; ++i)
  2074. {
  2075. // Terminate on existing null.
  2076. if (array[i] == 0)
  2077. {
  2078. used = i;
  2079. return *this;
  2080. }
  2081. if (UTF16_IS_SURROGATE(array[i]))
  2082. {
  2083. if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
  2084. array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
  2085. else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
  2086. array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
  2087. ++i;
  2088. }
  2089. if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
  2090. array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
  2091. }
  2092. // terminate
  2093. used = 0;
  2094. if (allocated > 0)
  2095. {
  2096. used = allocated - 1;
  2097. array[used] = 0;
  2098. }
  2099. return *this;
  2100. }
  2101. //! Gets the last char of the ustring16, or 0.
  2102. //! \return The last char of the ustring16, or 0.
  2103. uchar32_t lastChar() const
  2104. {
  2105. if (used < 1)
  2106. return 0;
  2107. if (UTF16_IS_SURROGATE_LO(array[used-1]))
  2108. {
  2109. // Make sure we have a paired surrogate.
  2110. if (used < 2)
  2111. return 0;
  2112. // Check for an invalid surrogate.
  2113. if (!UTF16_IS_SURROGATE_HI(array[used-2]))
  2114. return 0;
  2115. // Convert the surrogate pair into a single UTF-32 character.
  2116. return unicode::toUTF32(array[used-2], array[used-1]);
  2117. }
  2118. else
  2119. {
  2120. return array[used-1];
  2121. }
  2122. }
  2123. //! Split the ustring16 into parts.
  2124. /** This method will split a ustring16 at certain delimiter characters
  2125. into the container passed in as reference. The type of the container
  2126. has to be given as template parameter. It must provide a push_back and
  2127. a size method.
  2128. \param ret The result container
  2129. \param c C-style ustring16 of delimiter characters
  2130. \param count Number of delimiter characters
  2131. \param ignoreEmptyTokens Flag to avoid empty substrings in the result
  2132. container. If two delimiters occur without a character in between, an
  2133. empty substring would be placed in the result. If this flag is set,
  2134. only non-empty strings are stored.
  2135. \param keepSeparators Flag which allows to add the separator to the
  2136. result ustring16. If this flag is true, the concatenation of the
  2137. substrings results in the original ustring16. Otherwise, only the
  2138. characters between the delimiters are returned.
  2139. \return The number of resulting substrings
  2140. */
  2141. template<class container>
  2142. u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
  2143. {
  2144. if (!c)
  2145. return 0;
  2146. const_iterator i(*this);
  2147. const u32 oldSize=ret.size();
  2148. u32 pos = 0;
  2149. u32 lastpos = 0;
  2150. u32 lastpospos = 0;
  2151. bool lastWasSeparator = false;
  2152. while (!i.atEnd())
  2153. {
  2154. uchar32_t ch = *i;
  2155. bool foundSeparator = false;
  2156. for (u32 j=0; j<count; ++j)
  2157. {
  2158. if (ch == c[j])
  2159. {
  2160. if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
  2161. !lastWasSeparator)
  2162. ret.push_back(ustring16<TAlloc>(&array[lastpospos], pos - lastpos));
  2163. foundSeparator = true;
  2164. lastpos = (keepSeparators ? pos : pos + 1);
  2165. lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
  2166. break;
  2167. }
  2168. }
  2169. lastWasSeparator = foundSeparator;
  2170. ++pos;
  2171. ++i;
  2172. }
  2173. u32 s = size() + 1;
  2174. if (s > lastpos)
  2175. ret.push_back(ustring16<TAlloc>(&array[lastpospos], s - lastpos));
  2176. return ret.size()-oldSize;
  2177. }
  2178. //! Split the ustring16 into parts.
  2179. /** This method will split a ustring16 at certain delimiter characters
  2180. into the container passed in as reference. The type of the container
  2181. has to be given as template parameter. It must provide a push_back and
  2182. a size method.
  2183. \param ret The result container
  2184. \param c A unicode string of delimiter characters
  2185. \param ignoreEmptyTokens Flag to avoid empty substrings in the result
  2186. container. If two delimiters occur without a character in between, an
  2187. empty substring would be placed in the result. If this flag is set,
  2188. only non-empty strings are stored.
  2189. \param keepSeparators Flag which allows to add the separator to the
  2190. result ustring16. If this flag is true, the concatenation of the
  2191. substrings results in the original ustring16. Otherwise, only the
  2192. characters between the delimiters are returned.
  2193. \return The number of resulting substrings
  2194. */
  2195. template<class container>
  2196. u32 split(container& ret, const ustring16<TAlloc>& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
  2197. {
  2198. core::array<uchar32_t> v = c.toUTF32();
  2199. return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
  2200. }
  2201. //! Gets the size of the allocated memory buffer for the string.
  2202. //! \return The size of the allocated memory buffer.
  2203. u32 capacity() const
  2204. {
  2205. return allocated;
  2206. }
  2207. //! Returns the raw number of UTF-16 code points in the string which includes the individual surrogates.
  2208. //! \return The raw number of UTF-16 code points, excluding the trialing NUL.
  2209. u32 size_raw() const
  2210. {
  2211. return used;
  2212. }
  2213. //! Inserts a character into the string.
  2214. //! \param c The character to insert.
  2215. //! \param pos The position to insert the character.
  2216. //! \return A reference to our current string.
  2217. ustring16<TAlloc>& insert(uchar32_t c, u32 pos)
  2218. {
  2219. u8 len = (c > 0xFFFF ? 2 : 1);
  2220. if (used + len >= allocated)
  2221. reallocate(used + len);
  2222. used += len;
  2223. iterator iter(*this, pos);
  2224. for (u32 i = used - 2; i > iter.getPos(); --i)
  2225. array[i] = array[i - len];
  2226. if (c > 0xFFFF)
  2227. {
  2228. // c will be multibyte, so split it up into a surrogate pair.
  2229. uchar16_t x = static_cast<uchar16_t>(c);
  2230. uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
  2231. uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
  2232. array[iter.getPos()] = vh;
  2233. array[iter.getPos()+1] = vl;
  2234. }
  2235. else
  2236. {
  2237. array[iter.getPos()] = static_cast<uchar16_t>(c);
  2238. }
  2239. array[used] = 0;
  2240. return *this;
  2241. }
  2242. //! Inserts a string into the string.
  2243. //! \param c The string to insert.
  2244. //! \param pos The position to insert the string.
  2245. //! \return A reference to our current string.
  2246. ustring16<TAlloc>& insert(const ustring16<TAlloc>& c, u32 pos)
  2247. {
  2248. u32 len = c.size_raw();
  2249. if (len == 0) return *this;
  2250. if (used + len >= allocated)
  2251. reallocate(used + len);
  2252. used += len;
  2253. iterator iter(*this, pos);
  2254. for (u32 i = used - 2; i > iter.getPos() + len; --i)
  2255. array[i] = array[i - len];
  2256. const uchar16_t* s = c.c_str();
  2257. for (u32 i = 0; i < len; ++i)
  2258. {
  2259. array[pos++] = *s;
  2260. ++s;
  2261. }
  2262. array[used] = 0;
  2263. return *this;
  2264. }
  2265. //! Inserts a character into the string.
  2266. //! \param c The character to insert.
  2267. //! \param pos The position to insert the character.
  2268. //! \return A reference to our current string.
  2269. ustring16<TAlloc>& insert_raw(uchar16_t c, u32 pos)
  2270. {
  2271. if (used + 1 >= allocated)
  2272. reallocate(used + 1);
  2273. ++used;
  2274. for (u32 i = used - 1; i > pos; --i)
  2275. array[i] = array[i - 1];
  2276. array[pos] = c;
  2277. array[used] = 0;
  2278. return *this;
  2279. }
  2280. //! Removes a character from string.
  2281. //! \param pos Position of the character to remove.
  2282. //! \return A reference to our current string.
  2283. ustring16<TAlloc>& erase_raw(u32 pos)
  2284. {
  2285. for (u32 i=pos; i<=used; ++i)
  2286. {
  2287. array[i] = array[i + 1];
  2288. }
  2289. --used;
  2290. array[used] = 0;
  2291. return *this;
  2292. }
  2293. //! Replaces a character in the string.
  2294. //! \param c The new character.
  2295. //! \param pos The position of the character to replace.
  2296. //! \return A reference to our current string.
  2297. ustring16<TAlloc>& replace_raw(uchar16_t c, u32 pos)
  2298. {
  2299. array[pos] = c;
  2300. return *this;
  2301. }
  2302. //! Returns an iterator to the beginning of the string.
  2303. //! \return An iterator to the beginning of the string.
  2304. iterator begin()
  2305. {
  2306. iterator i(*this, 0);
  2307. return i;
  2308. }
  2309. //! Returns an iterator to the beginning of the string.
  2310. //! \return An iterator to the beginning of the string.
  2311. const_iterator begin() const
  2312. {
  2313. const_iterator i(*this, 0);
  2314. return i;
  2315. }
  2316. //! Returns an iterator to the beginning of the string.
  2317. //! \return An iterator to the beginning of the string.
  2318. const_iterator cbegin() const
  2319. {
  2320. const_iterator i(*this, 0);
  2321. return i;
  2322. }
  2323. //! Returns an iterator to the end of the string.
  2324. //! \return An iterator to the end of the string.
  2325. iterator end()
  2326. {
  2327. iterator i(*this, 0);
  2328. i.toEnd();
  2329. return i;
  2330. }
  2331. //! Returns an iterator to the end of the string.
  2332. //! \return An iterator to the end of the string.
  2333. const_iterator end() const
  2334. {
  2335. const_iterator i(*this, 0);
  2336. i.toEnd();
  2337. return i;
  2338. }
  2339. //! Returns an iterator to the end of the string.
  2340. //! \return An iterator to the end of the string.
  2341. const_iterator cend() const
  2342. {
  2343. const_iterator i(*this, 0);
  2344. i.toEnd();
  2345. return i;
  2346. }
  2347. //! Converts the string to a UTF-8 encoded string.
  2348. //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
  2349. //! \return A string containing the UTF-8 encoded string.
  2350. core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
  2351. {
  2352. core::string<uchar8_t> ret;
  2353. ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
  2354. const_iterator iter(*this, 0);
  2355. // Add the byte order mark if the user wants it.
  2356. if (addBOM)
  2357. {
  2358. ret.append(unicode::BOM_ENCODE_UTF8[0]);
  2359. ret.append(unicode::BOM_ENCODE_UTF8[1]);
  2360. ret.append(unicode::BOM_ENCODE_UTF8[2]);
  2361. }
  2362. while (!iter.atEnd())
  2363. {
  2364. uchar32_t c = *iter;
  2365. if (c > 0xFFFF)
  2366. { // 4 bytes
  2367. uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
  2368. uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
  2369. uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
  2370. uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
  2371. ret.append(b1);
  2372. ret.append(b2);
  2373. ret.append(b3);
  2374. ret.append(b4);
  2375. }
  2376. else if (c > 0x7FF)
  2377. { // 3 bytes
  2378. uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
  2379. uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
  2380. uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
  2381. ret.append(b1);
  2382. ret.append(b2);
  2383. ret.append(b3);
  2384. }
  2385. else if (c > 0x7F)
  2386. { // 2 bytes
  2387. uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
  2388. uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
  2389. ret.append(b1);
  2390. ret.append(b2);
  2391. }
  2392. else
  2393. { // 1 byte
  2394. ret.append(static_cast<uchar8_t>(c));
  2395. }
  2396. ++iter;
  2397. }
  2398. return ret;
  2399. }
  2400. //! Converts the string to a UTF-8 encoded string array.
  2401. //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
  2402. //! \return An array containing the UTF-8 encoded string.
  2403. core::array<uchar8_t> toUTF8(const bool addBOM = false) const
  2404. {
  2405. core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
  2406. const_iterator iter(*this, 0);
  2407. // Add the byte order mark if the user wants it.
  2408. if (addBOM)
  2409. {
  2410. ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
  2411. ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
  2412. ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
  2413. }
  2414. while (!iter.atEnd())
  2415. {
  2416. uchar32_t c = *iter;
  2417. if (c > 0xFFFF)
  2418. { // 4 bytes
  2419. uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
  2420. uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
  2421. uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
  2422. uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
  2423. ret.push_back(b1);
  2424. ret.push_back(b2);
  2425. ret.push_back(b3);
  2426. ret.push_back(b4);
  2427. }
  2428. else if (c > 0x7FF)
  2429. { // 3 bytes
  2430. uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
  2431. uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
  2432. uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
  2433. ret.push_back(b1);
  2434. ret.push_back(b2);
  2435. ret.push_back(b3);
  2436. }
  2437. else if (c > 0x7F)
  2438. { // 2 bytes
  2439. uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
  2440. uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
  2441. ret.push_back(b1);
  2442. ret.push_back(b2);
  2443. }
  2444. else
  2445. { // 1 byte
  2446. ret.push_back(static_cast<uchar8_t>(c));
  2447. }
  2448. ++iter;
  2449. }
  2450. ret.push_back(0);
  2451. return ret;
  2452. }
  2453. #ifdef USTRING_CPP0X_NEWLITERALS // C++0x
  2454. //! Converts the string to a UTF-16 encoded string.
  2455. //! \param endian The desired endianness of the string.
  2456. //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
  2457. //! \return A string containing the UTF-16 encoded string.
  2458. core::string<char16_t> toUTF16_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
  2459. {
  2460. core::string<char16_t> ret;
  2461. ret.reserve(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
  2462. // Add the BOM if specified.
  2463. if (addBOM)
  2464. {
  2465. if (endian == unicode::EUTFEE_NATIVE)
  2466. ret[0] = unicode::BOM;
  2467. else if (endian == unicode::EUTFEE_LITTLE)
  2468. {
  2469. uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(&ret[0]);
  2470. *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
  2471. *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
  2472. }
  2473. else
  2474. {
  2475. uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(&ret[0]);
  2476. *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
  2477. *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
  2478. }
  2479. }
  2480. ret.append(array);
  2481. if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
  2482. {
  2483. char16_t* ptr = ret.c_str();
  2484. for (u32 i = 0; i < ret.size(); ++i)
  2485. *ptr++ = unicode::swapEndian16(*ptr);
  2486. }
  2487. return ret;
  2488. }
  2489. #endif
  2490. //! Converts the string to a UTF-16 encoded string array.
  2491. //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
  2492. //! \param endian The desired endianness of the string.
  2493. //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
  2494. //! \return An array containing the UTF-16 encoded string.
  2495. core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
  2496. {
  2497. core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
  2498. uchar16_t* ptr = ret.pointer();
  2499. // Add the BOM if specified.
  2500. if (addBOM)
  2501. {
  2502. if (endian == unicode::EUTFEE_NATIVE)
  2503. *ptr = unicode::BOM;
  2504. else if (endian == unicode::EUTFEE_LITTLE)
  2505. {
  2506. uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
  2507. *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
  2508. *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
  2509. }
  2510. else
  2511. {
  2512. uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
  2513. *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
  2514. *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
  2515. }
  2516. ++ptr;
  2517. }
  2518. memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
  2519. if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
  2520. {
  2521. for (u32 i = 0; i <= used; ++i)
  2522. ptr[i] = unicode::swapEndian16(ptr[i]);
  2523. }
  2524. ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
  2525. ret.push_back(0);
  2526. return ret;
  2527. }
  2528. #ifdef USTRING_CPP0X_NEWLITERALS // C++0x
  2529. //! Converts the string to a UTF-32 encoded string.
  2530. //! \param endian The desired endianness of the string.
  2531. //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
  2532. //! \return A string containing the UTF-32 encoded string.
  2533. core::string<char32_t> toUTF32_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
  2534. {
  2535. core::string<char32_t> ret;
  2536. ret.reserve(size() + 1 + (addBOM ? unicode::BOM_UTF32_LEN : 0));
  2537. const_iterator iter(*this, 0);
  2538. // Add the BOM if specified.
  2539. if (addBOM)
  2540. {
  2541. if (endian == unicode::EUTFEE_NATIVE)
  2542. ret.append(unicode::BOM);
  2543. else
  2544. {
  2545. union
  2546. {
  2547. uchar32_t full;
  2548. u8 chunk[4];
  2549. } t;
  2550. if (endian == unicode::EUTFEE_LITTLE)
  2551. {
  2552. t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
  2553. t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
  2554. t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
  2555. t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
  2556. }
  2557. else
  2558. {
  2559. t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
  2560. t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
  2561. t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
  2562. t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
  2563. }
  2564. ret.append(t.full);
  2565. }
  2566. }
  2567. while (!iter.atEnd())
  2568. {
  2569. uchar32_t c = *iter;
  2570. if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
  2571. c = unicode::swapEndian32(c);
  2572. ret.append(c);
  2573. ++iter;
  2574. }
  2575. return ret;
  2576. }
  2577. #endif
  2578. //! Converts the string to a UTF-32 encoded string array.
  2579. //! Unfortunately, no toUTF32_s() version exists due to limitations with Irrlicht's string class.
  2580. //! \param endian The desired endianness of the string.
  2581. //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
  2582. //! \return An array containing the UTF-32 encoded string.
  2583. core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
  2584. {
  2585. core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
  2586. const_iterator iter(*this, 0);
  2587. // Add the BOM if specified.
  2588. if (addBOM)
  2589. {
  2590. if (endian == unicode::EUTFEE_NATIVE)
  2591. ret.push_back(unicode::BOM);
  2592. else
  2593. {
  2594. union
  2595. {
  2596. uchar32_t full;
  2597. u8 chunk[4];
  2598. } t;
  2599. if (endian == unicode::EUTFEE_LITTLE)
  2600. {
  2601. t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
  2602. t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
  2603. t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
  2604. t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
  2605. }
  2606. else
  2607. {
  2608. t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
  2609. t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
  2610. t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
  2611. t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
  2612. }
  2613. ret.push_back(t.full);
  2614. }
  2615. }
  2616. ret.push_back(0);
  2617. while (!iter.atEnd())
  2618. {
  2619. uchar32_t c = *iter;
  2620. if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
  2621. c = unicode::swapEndian32(c);
  2622. ret.push_back(c);
  2623. ++iter;
  2624. }
  2625. return ret;
  2626. }
  2627. //! Converts the string to a wchar_t encoded string.
  2628. /** The size of a wchar_t changes depending on the platform. This function will store a
  2629. correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
  2630. //! \param endian The desired endianness of the string.
  2631. //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
  2632. //! \return A string containing the wchar_t encoded string.
  2633. core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
  2634. {
  2635. if (sizeof(wchar_t) == 4)
  2636. {
  2637. core::array<uchar32_t> a(toUTF32(endian, addBOM));
  2638. core::stringw ret(a.pointer());
  2639. return ret;
  2640. }
  2641. else if (sizeof(wchar_t) == 2)
  2642. {
  2643. if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
  2644. {
  2645. core::stringw ret(array);
  2646. return ret;
  2647. }
  2648. else
  2649. {
  2650. core::array<uchar16_t> a(toUTF16(endian, addBOM));
  2651. core::stringw ret(a.pointer());
  2652. return ret;
  2653. }
  2654. }
  2655. else if (sizeof(wchar_t) == 1)
  2656. {
  2657. core::array<uchar8_t> a(toUTF8(addBOM));
  2658. core::stringw ret(a.pointer());
  2659. return ret;
  2660. }
  2661. // Shouldn't happen.
  2662. return core::stringw();
  2663. }
  2664. //! Converts the string to a wchar_t encoded string array.
  2665. /** The size of a wchar_t changes depending on the platform. This function will store a
  2666. correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
  2667. //! \param endian The desired endianness of the string.
  2668. //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
  2669. //! \return An array containing the wchar_t encoded string.
  2670. core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
  2671. {
  2672. if (sizeof(wchar_t) == 4)
  2673. {
  2674. core::array<uchar32_t> a(toUTF32(endian, addBOM));
  2675. core::array<wchar_t> ret(a.size());
  2676. ret.set_used(a.size());
  2677. memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
  2678. return ret;
  2679. }
  2680. if (sizeof(wchar_t) == 2)
  2681. {
  2682. if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
  2683. {
  2684. core::array<wchar_t> ret(used);
  2685. ret.set_used(used);
  2686. memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
  2687. return ret;
  2688. }
  2689. else
  2690. {
  2691. core::array<uchar16_t> a(toUTF16(endian, addBOM));
  2692. core::array<wchar_t> ret(a.size());
  2693. ret.set_used(a.size());
  2694. memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
  2695. return ret;
  2696. }
  2697. }
  2698. if (sizeof(wchar_t) == 1)
  2699. {
  2700. core::array<uchar8_t> a(toUTF8(addBOM));
  2701. core::array<wchar_t> ret(a.size());
  2702. ret.set_used(a.size());
  2703. memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
  2704. return ret;
  2705. }
  2706. // Shouldn't happen.
  2707. return core::array<wchar_t>();
  2708. }
  2709. //! Converts the string to a properly encoded io::path string.
  2710. //! \param endian The desired endianness of the string.
  2711. //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
  2712. //! \return An io::path string containing the properly encoded string.
  2713. io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
  2714. {
  2715. #if defined(_IRR_WCHAR_FILESYSTEM)
  2716. return toWCHAR_s(endian, addBOM);
  2717. #else
  2718. return toUTF8_s(addBOM);
  2719. #endif
  2720. }
  2721. //! Loads an unknown stream of data.
  2722. //! Will attempt to determine if the stream is unicode data. Useful for loading from files.
  2723. //! \param data The data stream to load from.
  2724. //! \param data_size The length of the data string.
  2725. //! \return A reference to our current string.
  2726. ustring16<TAlloc>& loadDataStream(const char* data, size_t data_size)
  2727. {
  2728. // Clear our string.
  2729. *this = "";
  2730. if (!data)
  2731. return *this;
  2732. unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
  2733. switch (e)
  2734. {
  2735. default:
  2736. case unicode::EUTFE_UTF8:
  2737. append((uchar8_t*)data, data_size);
  2738. break;
  2739. case unicode::EUTFE_UTF16:
  2740. case unicode::EUTFE_UTF16_BE:
  2741. case unicode::EUTFE_UTF16_LE:
  2742. append((uchar16_t*)data, data_size / 2);
  2743. break;
  2744. case unicode::EUTFE_UTF32:
  2745. case unicode::EUTFE_UTF32_BE:
  2746. case unicode::EUTFE_UTF32_LE:
  2747. append((uchar32_t*)data, data_size / 4);
  2748. break;
  2749. }
  2750. return *this;
  2751. }
  2752. //! Gets the encoding of the Unicode string this class contains.
  2753. //! \return An enum describing the current encoding of this string.
  2754. const unicode::EUTF_ENCODE getEncoding() const
  2755. {
  2756. return encoding;
  2757. }
  2758. //! Gets the endianness of the Unicode string this class contains.
  2759. //! \return An enum describing the endianness of this string.
  2760. const unicode::EUTF_ENDIAN getEndianness() const
  2761. {
  2762. if (encoding == unicode::EUTFE_UTF16_LE ||
  2763. encoding == unicode::EUTFE_UTF32_LE)
  2764. return unicode::EUTFEE_LITTLE;
  2765. else return unicode::EUTFEE_BIG;
  2766. }
  2767. private:
  2768. //! Reallocate the string, making it bigger or smaller.
  2769. //! \param new_size The new size of the string.
  2770. void reallocate(u32 new_size)
  2771. {
  2772. uchar16_t* old_array = array;
  2773. array = allocator.allocate(new_size + 1); //new u16[new_size];
  2774. allocated = new_size + 1;
  2775. if (old_array == 0) return;
  2776. u32 amount = used < new_size ? used : new_size;
  2777. for (u32 i=0; i<=amount; ++i)
  2778. array[i] = old_array[i];
  2779. if (allocated <= used)
  2780. used = allocated - 1;
  2781. array[used] = 0;
  2782. allocator.deallocate(old_array); // delete [] old_array;
  2783. }
  2784. //--- member variables
  2785. uchar16_t* array;
  2786. unicode::EUTF_ENCODE encoding;
  2787. u32 allocated;
  2788. u32 used;
  2789. TAlloc allocator;
  2790. //irrAllocator<uchar16_t> allocator;
  2791. };
  2792. typedef ustring16<irrAllocator<uchar16_t> > ustring;
  2793. //! Appends two ustring16s.
  2794. template <typename TAlloc>
  2795. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const ustring16<TAlloc>& right)
  2796. {
  2797. ustring16<TAlloc> ret(left);
  2798. ret += right;
  2799. return ret;
  2800. }
  2801. //! Appends a ustring16 and a null-terminated unicode string.
  2802. template <typename TAlloc, class B>
  2803. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const B* const right)
  2804. {
  2805. ustring16<TAlloc> ret(left);
  2806. ret += right;
  2807. return ret;
  2808. }
  2809. //! Appends a ustring16 and a null-terminated unicode string.
  2810. template <class B, typename TAlloc>
  2811. inline ustring16<TAlloc> operator+(const B* const left, const ustring16<TAlloc>& right)
  2812. {
  2813. ustring16<TAlloc> ret(left);
  2814. ret += right;
  2815. return ret;
  2816. }
  2817. //! Appends a ustring16 and an Irrlicht string.
  2818. template <typename TAlloc, typename B, typename BAlloc>
  2819. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const string<B, BAlloc>& right)
  2820. {
  2821. ustring16<TAlloc> ret(left);
  2822. ret += right;
  2823. return ret;
  2824. }
  2825. //! Appends a ustring16 and an Irrlicht string.
  2826. template <typename TAlloc, typename B, typename BAlloc>
  2827. inline ustring16<TAlloc> operator+(const string<B, BAlloc>& left, const ustring16<TAlloc>& right)
  2828. {
  2829. ustring16<TAlloc> ret(left);
  2830. ret += right;
  2831. return ret;
  2832. }
  2833. //! Appends a ustring16 and a std::basic_string.
  2834. template <typename TAlloc, typename B, typename A, typename BAlloc>
  2835. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const std::basic_string<B, A, BAlloc>& right)
  2836. {
  2837. ustring16<TAlloc> ret(left);
  2838. ret += right;
  2839. return ret;
  2840. }
  2841. //! Appends a ustring16 and a std::basic_string.
  2842. template <typename TAlloc, typename B, typename A, typename BAlloc>
  2843. inline ustring16<TAlloc> operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16<TAlloc>& right)
  2844. {
  2845. ustring16<TAlloc> ret(left);
  2846. ret += right;
  2847. return ret;
  2848. }
  2849. //! Appends a ustring16 and a char.
  2850. template <typename TAlloc>
  2851. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const char right)
  2852. {
  2853. ustring16<TAlloc> ret(left);
  2854. ret += right;
  2855. return ret;
  2856. }
  2857. //! Appends a ustring16 and a char.
  2858. template <typename TAlloc>
  2859. inline ustring16<TAlloc> operator+(const char left, const ustring16<TAlloc>& right)
  2860. {
  2861. ustring16<TAlloc> ret(left);
  2862. ret += right;
  2863. return ret;
  2864. }
  2865. #ifdef USTRING_CPP0X_NEWLITERALS
  2866. //! Appends a ustring16 and a uchar32_t.
  2867. template <typename TAlloc>
  2868. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const uchar32_t right)
  2869. {
  2870. ustring16<TAlloc> ret(left);
  2871. ret += right;
  2872. return ret;
  2873. }
  2874. //! Appends a ustring16 and a uchar32_t.
  2875. template <typename TAlloc>
  2876. inline ustring16<TAlloc> operator+(const uchar32_t left, const ustring16<TAlloc>& right)
  2877. {
  2878. ustring16<TAlloc> ret(left);
  2879. ret += right;
  2880. return ret;
  2881. }
  2882. #endif
  2883. //! Appends a ustring16 and a short.
  2884. template <typename TAlloc>
  2885. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const short right)
  2886. {
  2887. ustring16<TAlloc> ret(left);
  2888. ret += core::stringc(right);
  2889. return ret;
  2890. }
  2891. //! Appends a ustring16 and a short.
  2892. template <typename TAlloc>
  2893. inline ustring16<TAlloc> operator+(const short left, const ustring16<TAlloc>& right)
  2894. {
  2895. ustring16<TAlloc> ret((core::stringc(left)));
  2896. ret += right;
  2897. return ret;
  2898. }
  2899. //! Appends a ustring16 and an unsigned short.
  2900. template <typename TAlloc>
  2901. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned short right)
  2902. {
  2903. ustring16<TAlloc> ret(left);
  2904. ret += core::stringc(right);
  2905. return ret;
  2906. }
  2907. //! Appends a ustring16 and an unsigned short.
  2908. template <typename TAlloc>
  2909. inline ustring16<TAlloc> operator+(const unsigned short left, const ustring16<TAlloc>& right)
  2910. {
  2911. ustring16<TAlloc> ret((core::stringc(left)));
  2912. ret += right;
  2913. return ret;
  2914. }
  2915. //! Appends a ustring16 and an int.
  2916. template <typename TAlloc>
  2917. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const int right)
  2918. {
  2919. ustring16<TAlloc> ret(left);
  2920. ret += core::stringc(right);
  2921. return ret;
  2922. }
  2923. //! Appends a ustring16 and an int.
  2924. template <typename TAlloc>
  2925. inline ustring16<TAlloc> operator+(const int left, const ustring16<TAlloc>& right)
  2926. {
  2927. ustring16<TAlloc> ret((core::stringc(left)));
  2928. ret += right;
  2929. return ret;
  2930. }
  2931. //! Appends a ustring16 and an unsigned int.
  2932. template <typename TAlloc>
  2933. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned int right)
  2934. {
  2935. ustring16<TAlloc> ret(left);
  2936. ret += core::stringc(right);
  2937. return ret;
  2938. }
  2939. //! Appends a ustring16 and an unsigned int.
  2940. template <typename TAlloc>
  2941. inline ustring16<TAlloc> operator+(const unsigned int left, const ustring16<TAlloc>& right)
  2942. {
  2943. ustring16<TAlloc> ret((core::stringc(left)));
  2944. ret += right;
  2945. return ret;
  2946. }
  2947. //! Appends a ustring16 and a long.
  2948. template <typename TAlloc>
  2949. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const long right)
  2950. {
  2951. ustring16<TAlloc> ret(left);
  2952. ret += core::stringc(right);
  2953. return ret;
  2954. }
  2955. //! Appends a ustring16 and a long.
  2956. template <typename TAlloc>
  2957. inline ustring16<TAlloc> operator+(const long left, const ustring16<TAlloc>& right)
  2958. {
  2959. ustring16<TAlloc> ret((core::stringc(left)));
  2960. ret += right;
  2961. return ret;
  2962. }
  2963. //! Appends a ustring16 and an unsigned long.
  2964. template <typename TAlloc>
  2965. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned long right)
  2966. {
  2967. ustring16<TAlloc> ret(left);
  2968. ret += core::stringc(right);
  2969. return ret;
  2970. }
  2971. //! Appends a ustring16 and an unsigned long.
  2972. template <typename TAlloc>
  2973. inline ustring16<TAlloc> operator+(const unsigned long left, const ustring16<TAlloc>& right)
  2974. {
  2975. ustring16<TAlloc> ret((core::stringc(left)));
  2976. ret += right;
  2977. return ret;
  2978. }
  2979. //! Appends a ustring16 and a float.
  2980. template <typename TAlloc>
  2981. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const float right)
  2982. {
  2983. ustring16<TAlloc> ret(left);
  2984. ret += core::stringc(right);
  2985. return ret;
  2986. }
  2987. //! Appends a ustring16 and a float.
  2988. template <typename TAlloc>
  2989. inline ustring16<TAlloc> operator+(const float left, const ustring16<TAlloc>& right)
  2990. {
  2991. ustring16<TAlloc> ret((core::stringc(left)));
  2992. ret += right;
  2993. return ret;
  2994. }
  2995. //! Appends a ustring16 and a double.
  2996. template <typename TAlloc>
  2997. inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const double right)
  2998. {
  2999. ustring16<TAlloc> ret(left);
  3000. ret += core::stringc(right);
  3001. return ret;
  3002. }
  3003. //! Appends a ustring16 and a double.
  3004. template <typename TAlloc>
  3005. inline ustring16<TAlloc> operator+(const double left, const ustring16<TAlloc>& right)
  3006. {
  3007. ustring16<TAlloc> ret((core::stringc(left)));
  3008. ret += right;
  3009. return ret;
  3010. }
  3011. #ifdef USTRING_CPP0X
  3012. //! Appends two ustring16s.
  3013. template <typename TAlloc>
  3014. inline ustring16<TAlloc>&& operator+(const ustring16<TAlloc>& left, ustring16<TAlloc>&& right)
  3015. {
  3016. //std::cout << "MOVE operator+(&, &&)" << std::endl;
  3017. right.insert(left, 0);
  3018. return std::move(right);
  3019. }
  3020. //! Appends two ustring16s.
  3021. template <typename TAlloc>
  3022. inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const ustring16<TAlloc>& right)
  3023. {
  3024. //std::cout << "MOVE operator+(&&, &)" << std::endl;
  3025. left.append(right);
  3026. return std::move(left);
  3027. }
  3028. //! Appends two ustring16s.
  3029. template <typename TAlloc>
  3030. inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, ustring16<TAlloc>&& right)
  3031. {
  3032. //std::cout << "MOVE operator+(&&, &&)" << std::endl;
  3033. if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
  3034. (right.capacity() - right.size_raw() < left.size_raw()))
  3035. {
  3036. left.append(right);
  3037. return std::move(left);
  3038. }
  3039. else
  3040. {
  3041. right.insert(left, 0);
  3042. return std::move(right);
  3043. }
  3044. }
  3045. //! Appends a ustring16 and a null-terminated unicode string.
  3046. template <typename TAlloc, class B>
  3047. inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const B* const right)
  3048. {
  3049. //std::cout << "MOVE operator+(&&, B*)" << std::endl;
  3050. left.append(right);
  3051. return std::move(left);
  3052. }
  3053. //! Appends a ustring16 and a null-terminated unicode string.
  3054. template <class B, typename TAlloc>
  3055. inline ustring16<TAlloc>&& operator+(const B* const left, ustring16<TAlloc>&& right)
  3056. {
  3057. //std::cout << "MOVE operator+(B*, &&)" << std::endl;
  3058. right.insert(left, 0);
  3059. return std::move(right);
  3060. }
  3061. //! Appends a ustring16 and an Irrlicht string.
  3062. template <typename TAlloc, typename B, typename BAlloc>
  3063. inline ustring16<TAlloc>&& operator+(const string<B, BAlloc>& left, ustring16<TAlloc>&& right)
  3064. {
  3065. //std::cout << "MOVE operator+(&, &&)" << std::endl;
  3066. right.insert(left, 0);
  3067. return std::move(right);
  3068. }
  3069. //! Appends a ustring16 and an Irrlicht string.
  3070. template <typename TAlloc, typename B, typename BAlloc>
  3071. inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const string<B, BAlloc>& right)
  3072. {
  3073. //std::cout << "MOVE operator+(&&, &)" << std::endl;
  3074. left.append(right);
  3075. return std::move(left);
  3076. }
  3077. //! Appends a ustring16 and a std::basic_string.
  3078. template <typename TAlloc, typename B, typename A, typename BAlloc>
  3079. inline ustring16<TAlloc>&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16<TAlloc>&& right)
  3080. {
  3081. //std::cout << "MOVE operator+(&, &&)" << std::endl;
  3082. right.insert(core::ustring16<TAlloc>(left), 0);
  3083. return std::move(right);
  3084. }
  3085. //! Appends a ustring16 and a std::basic_string.
  3086. template <typename TAlloc, typename B, typename A, typename BAlloc>
  3087. inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const std::basic_string<B, A, BAlloc>& right)
  3088. {
  3089. //std::cout << "MOVE operator+(&&, &)" << std::endl;
  3090. left.append(right);
  3091. return std::move(left);
  3092. }
  3093. //! Appends a ustring16 and a char.
  3094. template <typename TAlloc>
  3095. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const char right)
  3096. {
  3097. left.append((uchar32_t)right);
  3098. return std::move(left);
  3099. }
  3100. //! Appends a ustring16 and a char.
  3101. template <typename TAlloc>
  3102. inline ustring16<TAlloc> operator+(const char left, ustring16<TAlloc>&& right)
  3103. {
  3104. right.insert((uchar32_t)left, 0);
  3105. return std::move(right);
  3106. }
  3107. #ifdef USTRING_CPP0X_NEWLITERALS
  3108. //! Appends a ustring16 and a uchar32_t.
  3109. template <typename TAlloc>
  3110. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const uchar32_t right)
  3111. {
  3112. left.append(right);
  3113. return std::move(left);
  3114. }
  3115. //! Appends a ustring16 and a uchar32_t.
  3116. template <typename TAlloc>
  3117. inline ustring16<TAlloc> operator+(const uchar32_t left, ustring16<TAlloc>&& right)
  3118. {
  3119. right.insert(left, 0);
  3120. return std::move(right);
  3121. }
  3122. #endif
  3123. //! Appends a ustring16 and a short.
  3124. template <typename TAlloc>
  3125. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const short right)
  3126. {
  3127. left.append(core::stringc(right));
  3128. return std::move(left);
  3129. }
  3130. //! Appends a ustring16 and a short.
  3131. template <typename TAlloc>
  3132. inline ustring16<TAlloc> operator+(const short left, ustring16<TAlloc>&& right)
  3133. {
  3134. right.insert(core::stringc(left), 0);
  3135. return std::move(right);
  3136. }
  3137. //! Appends a ustring16 and an unsigned short.
  3138. template <typename TAlloc>
  3139. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned short right)
  3140. {
  3141. left.append(core::stringc(right));
  3142. return std::move(left);
  3143. }
  3144. //! Appends a ustring16 and an unsigned short.
  3145. template <typename TAlloc>
  3146. inline ustring16<TAlloc> operator+(const unsigned short left, ustring16<TAlloc>&& right)
  3147. {
  3148. right.insert(core::stringc(left), 0);
  3149. return std::move(right);
  3150. }
  3151. //! Appends a ustring16 and an int.
  3152. template <typename TAlloc>
  3153. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const int right)
  3154. {
  3155. left.append(core::stringc(right));
  3156. return std::move(left);
  3157. }
  3158. //! Appends a ustring16 and an int.
  3159. template <typename TAlloc>
  3160. inline ustring16<TAlloc> operator+(const int left, ustring16<TAlloc>&& right)
  3161. {
  3162. right.insert(core::stringc(left), 0);
  3163. return std::move(right);
  3164. }
  3165. //! Appends a ustring16 and an unsigned int.
  3166. template <typename TAlloc>
  3167. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned int right)
  3168. {
  3169. left.append(core::stringc(right));
  3170. return std::move(left);
  3171. }
  3172. //! Appends a ustring16 and an unsigned int.
  3173. template <typename TAlloc>
  3174. inline ustring16<TAlloc> operator+(const unsigned int left, ustring16<TAlloc>&& right)
  3175. {
  3176. right.insert(core::stringc(left), 0);
  3177. return std::move(right);
  3178. }
  3179. //! Appends a ustring16 and a long.
  3180. template <typename TAlloc>
  3181. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const long right)
  3182. {
  3183. left.append(core::stringc(right));
  3184. return std::move(left);
  3185. }
  3186. //! Appends a ustring16 and a long.
  3187. template <typename TAlloc>
  3188. inline ustring16<TAlloc> operator+(const long left, ustring16<TAlloc>&& right)
  3189. {
  3190. right.insert(core::stringc(left), 0);
  3191. return std::move(right);
  3192. }
  3193. //! Appends a ustring16 and an unsigned long.
  3194. template <typename TAlloc>
  3195. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned long right)
  3196. {
  3197. left.append(core::stringc(right));
  3198. return std::move(left);
  3199. }
  3200. //! Appends a ustring16 and an unsigned long.
  3201. template <typename TAlloc>
  3202. inline ustring16<TAlloc> operator+(const unsigned long left, ustring16<TAlloc>&& right)
  3203. {
  3204. right.insert(core::stringc(left), 0);
  3205. return std::move(right);
  3206. }
  3207. //! Appends a ustring16 and a float.
  3208. template <typename TAlloc>
  3209. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const float right)
  3210. {
  3211. left.append(core::stringc(right));
  3212. return std::move(left);
  3213. }
  3214. //! Appends a ustring16 and a float.
  3215. template <typename TAlloc>
  3216. inline ustring16<TAlloc> operator+(const float left, ustring16<TAlloc>&& right)
  3217. {
  3218. right.insert(core::stringc(left), 0);
  3219. return std::move(right);
  3220. }
  3221. //! Appends a ustring16 and a double.
  3222. template <typename TAlloc>
  3223. inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const double right)
  3224. {
  3225. left.append(core::stringc(right));
  3226. return std::move(left);
  3227. }
  3228. //! Appends a ustring16 and a double.
  3229. template <typename TAlloc>
  3230. inline ustring16<TAlloc> operator+(const double left, ustring16<TAlloc>&& right)
  3231. {
  3232. right.insert(core::stringc(left), 0);
  3233. return std::move(right);
  3234. }
  3235. #endif
  3236. #ifndef USTRING_NO_STL
  3237. //! Writes a ustring16 to an ostream.
  3238. template <typename TAlloc>
  3239. inline std::ostream& operator<<(std::ostream& out, const ustring16<TAlloc>& in)
  3240. {
  3241. out << in.toUTF8_s().c_str();
  3242. return out;
  3243. }
  3244. //! Writes a ustring16 to a wostream.
  3245. template <typename TAlloc>
  3246. inline std::wostream& operator<<(std::wostream& out, const ustring16<TAlloc>& in)
  3247. {
  3248. out << in.toWCHAR_s().c_str();
  3249. return out;
  3250. }
  3251. #endif
  3252. #ifndef USTRING_NO_STL
  3253. namespace unicode
  3254. {
  3255. //! Hashing algorithm for hashing a ustring. Used for things like unordered_maps.
  3256. //! Algorithm taken from std::hash<std::string>.
  3257. class hash : public std::unary_function<core::ustring, size_t>
  3258. {
  3259. public:
  3260. size_t operator()(const core::ustring& s) const
  3261. {
  3262. size_t ret = 2166136261U;
  3263. size_t index = 0;
  3264. size_t stride = 1 + s.size_raw() / 10;
  3265. core::ustring::const_iterator i = s.begin();
  3266. while (i != s.end())
  3267. {
  3268. // TODO: Don't force u32 on an x64 OS. Make it agnostic.
  3269. ret = 16777619U * ret ^ (size_t)s[(u32)index];
  3270. index += stride;
  3271. i += stride;
  3272. }
  3273. return (ret);
  3274. }
  3275. };
  3276. } // end namespace unicode
  3277. #endif
  3278. } // end namespace core
  3279. } // end namespace irr