sed.c 35 KB


  1. /* vi: set sw=4 ts=4: */
  2. /*
  3. * sed.c - very minimalist version of sed
  4. *
  5. * Copyright (C) 1999,2000,2001 by Lineo, inc. and Mark Whitley
  6. * Copyright (C) 1999,2000,2001 by Mark Whitley <markw@codepoet.org>
  7. * Copyright (C) 2002 Matt Kraai
  8. * Copyright (C) 2003 by Glenn McGrath <bug1@iinet.net.au>
  9. * Copyright (C) 2003,2004 by Rob Landley <rob@landley.net>
  10. *
  11. * MAINTAINER: Rob Landley <rob@landley.net>
  12. *
  13. * Licensed under GPL version 2, see file LICENSE in this tarball for details.
  14. */
  15. /* Code overview.
  16. Files are laid out to avoid unnecessary function declarations. So for
  17. example, every function add_cmd calls occurs before add_cmd in this file.
  18. add_cmd() is called on each line of sed command text (from a file or from
  19. the command line). It calls get_address() and parse_cmd_args(). The
  20. resulting sed_cmd_t structures are appended to a linked list
  21. (G.sed_cmd_head/G.sed_cmd_tail).
  22. add_input_file() adds a FILE * to the list of input files. We need to
  23. know all input sources ahead of time to find the last line for the $ match.
  24. process_files() does actual sedding, reading data lines from each input FILE *
  25. (which could be stdin) and applying the sed command list (sed_cmd_head) to
  26. each of the resulting lines.
  27. sed_main() is where external code calls into this, with a command line.
  28. */
  29. /*
  30. Supported features and commands in this version of sed:
  31. - comments ('#')
  32. - address matching: num|/matchstr/[,num|/matchstr/|$]command
  33. - commands: (p)rint, (d)elete, (s)ubstitute (with g & I flags)
  34. - edit commands: (a)ppend, (i)nsert, (c)hange
  35. - file commands: (r)ead
  36. - backreferences in substitution expressions (\0, \1, \2...\9)
  37. - grouped commands: {cmd1;cmd2}
  38. - transliteration (y/source-chars/dest-chars/)
  39. - pattern space hold space storing / swapping (g, h, x)
  40. - labels / branching (: label, b, t, T)
  41. (Note: Specifying an address (range) to match is *optional*; commands
  42. default to the whole pattern space if no specific address match was
  43. requested.)
  44. Todo:
  45. - Create a wrapper around regex to make libc's regex conform with sed
  46. Reference http://www.opengroup.org/onlinepubs/007904975/utilities/sed.html
  47. */
  48. #include "libbb.h"
  49. #include "xregex.h"
/* Each sed command turns into one of these structures.
 * add_cmd() allocates them zero-filled (xzalloc) and chains them through
 * 'next', so every field below starts out as 0/NULL. */
typedef struct sed_cmd_s {
	/* Ordered by alignment requirements (original note: 36 bytes on x86;
	 * not re-verified here) */
	struct sed_cmd_s *next; /* Next command (linked list, NULL terminated) */

	/* address storage */
	regex_t *beg_match;     /* sed -e '/match/cmd' */
	regex_t *end_match;     /* sed -e '/match/,/end_match/cmd' */
	regex_t *sub_match;     /* For 's/sub_match/string/'; NULL means "reuse previous regex" */
	int beg_line;           /* 'sed 1p'   0 == apply commands to all lines */
	int end_line;           /* 'sed 1,3p' 0 == one line only. -1 = last line ($) */

	FILE *sw_file;          /* File (sw) command writes to, NULL for none. */
	char *string;           /* Data string for (saicytb) commands. */

	unsigned short which_match; /* (s) Which match to replace (0 for all) */

	/* Bitfields (gcc won't group them if we don't) */
	unsigned invert:1;      /* the '!' after the address */
	unsigned in_match:1;    /* Next line also included in match? */
	unsigned sub_p:1;       /* (s) print option */

	char sw_last_char;      /* Last char written to sw_file (passed to
	                         * puts_maybe_newline as its last_puts_char) */

	/* GENERAL FIELDS */
	char cmd;               /* The command char: abcdDgGhHilnNpPqrstwxy:={} */
} sed_cmd_t;
/* Characters that separate commands on a script line: add_cmd() skips runs
 * of them between commands, and branch labels end at the first of them. */
static const char semicolon_whitespace[] ALIGN1 = "; \n\r\t\v";

/* All of the applet's mutable state, accessed through the G macro below. */
struct globals {
	/* options */
	int be_quiet, regex_type;
	FILE *nonstdout;
	char *outname, *hold_space;

	/* List of input files */
	int input_file_count, current_input_file;
	FILE **input_file_list;

	/* Match offsets from the most recent regexec() in do_subst_command() */
	regmatch_t regmatch[10];
	/* Regex reused when an 's' command has an empty match expression */
	regex_t *previous_regex_ptr;

	/* linked list of sed commands */
	sed_cmd_t sed_cmd_head, *sed_cmd_tail;

	/* Linked list of append lines */
	llist_t *append_head;

	/* Unfinished (backslash-continued) script line kept between add_cmd() calls */
	char *add_cmd_line;

	/* Growable output buffer filled by pipe_putc() */
	struct pipeline {
		char *buf; /* Space to hold string */
		int idx; /* Space used */
		int len; /* Space allocated */
	} pipeline;
};
/* The globals are overlaid on bb_common_bufsiz1 (declared outside this file). */
#define G (*(struct globals*)&bb_common_bufsiz1)
/* Declared but not defined in the visible source; presumably a build-time
 * assertion that only gets referenced when the size check below is true
 * -- TODO confirm. */
void BUG_sed_globals_too_big(void);
/* Check that the globals fit in the shared buffer, then make the command
 * list empty by pointing the tail at the dummy head node. */
#define INIT_G() do { \
	if (sizeof(struct globals) > COMMON_BUFSIZE) \
		BUG_sed_globals_too_big(); \
	G.sed_cmd_tail = &G.sed_cmd_head; \
} while (0)
  100. #if ENABLE_FEATURE_CLEAN_UP
  101. static void sed_free_and_close_stuff(void)
  102. {
  103. sed_cmd_t *sed_cmd = G.sed_cmd_head.next;
  104. llist_free(G.append_head, free);
  105. while (sed_cmd) {
  106. sed_cmd_t *sed_cmd_next = sed_cmd->next;
  107. if (sed_cmd->sw_file)
  108. xprint_and_close_file(sed_cmd->sw_file);
  109. if (sed_cmd->beg_match) {
  110. regfree(sed_cmd->beg_match);
  111. free(sed_cmd->beg_match);
  112. }
  113. if (sed_cmd->end_match) {
  114. regfree(sed_cmd->end_match);
  115. free(sed_cmd->end_match);
  116. }
  117. if (sed_cmd->sub_match) {
  118. regfree(sed_cmd->sub_match);
  119. free(sed_cmd->sub_match);
  120. }
  121. free(sed_cmd->string);
  122. free(sed_cmd);
  123. sed_cmd = sed_cmd_next;
  124. }
  125. if (G.hold_space) free(G.hold_space);
  126. while (G.current_input_file < G.input_file_count)
  127. fclose(G.input_file_list[G.current_input_file++]);
  128. }
  129. #else
  130. void sed_free_and_close_stuff(void);
  131. #endif
  132. /* If something bad happens during -i operation, delete temp file */
  133. static void cleanup_outname(void)
  134. {
  135. if (G.outname) unlink(G.outname);
  136. }
  137. /* strdup, replacing "\n" with '\n', and "\delimiter" with 'delimiter' */
  138. static void parse_escapes(char *dest, const char *string, int len, char from, char to)
  139. {
  140. int i = 0;
  141. while (i < len) {
  142. if (string[i] == '\\') {
  143. if (!to || string[i+1] == from) {
  144. *dest++ = to ? to : string[i+1];
  145. i += 2;
  146. continue;
  147. }
  148. *dest++ = string[i++];
  149. }
  150. *dest++ = string[i++];
  151. }
  152. *dest = 0;
  153. }
  154. static char *copy_parsing_escapes(const char *string, int len)
  155. {
  156. char *dest = xmalloc(len + 1);
  157. parse_escapes(dest, string, len, 'n', '\n');
  158. return dest;
  159. }
  160. /*
  161. * index_of_next_unescaped_regexp_delim - walks left to right through a string
  162. * beginning at a specified index and returns the index of the next regular
  163. * expression delimiter (typically a forward * slash ('/')) not preceded by
  164. * a backslash ('\'). A negative delimiter disables square bracket checking.
  165. */
  166. static int index_of_next_unescaped_regexp_delim(int delimiter, const char *str)
  167. {
  168. int bracket = -1;
  169. int escaped = 0;
  170. int idx = 0;
  171. char ch;
  172. if (delimiter < 0) {
  173. bracket--;
  174. delimiter = -delimiter;
  175. }
  176. for (; (ch = str[idx]); idx++) {
  177. if (bracket >= 0) {
  178. if (ch == ']' && !(bracket == idx - 1 || (bracket == idx - 2
  179. && str[idx - 1] == '^')))
  180. bracket = -1;
  181. } else if (escaped)
  182. escaped = 0;
  183. else if (ch == '\\')
  184. escaped = 1;
  185. else if (bracket == -1 && ch == '[')
  186. bracket = idx;
  187. else if (ch == delimiter)
  188. return idx;
  189. }
  190. /* if we make it to here, we've hit the end of the string */
  191. bb_error_msg_and_die("unmatched '%c'", delimiter);
  192. }
  193. /*
  194. * Returns the index of the third delimiter
  195. */
  196. static int parse_regex_delim(const char *cmdstr, char **match, char **replace)
  197. {
  198. const char *cmdstr_ptr = cmdstr;
  199. char delimiter;
  200. int idx = 0;
  201. /* verify that the 's' or 'y' is followed by something. That something
  202. * (typically a 'slash') is now our regexp delimiter... */
  203. if (*cmdstr == '\0')
  204. bb_error_msg_and_die("bad format in substitution expression");
  205. delimiter = *cmdstr_ptr++;
  206. /* save the match string */
  207. idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
  208. *match = copy_parsing_escapes(cmdstr_ptr, idx);
  209. /* save the replacement string */
  210. cmdstr_ptr += idx + 1;
  211. idx = index_of_next_unescaped_regexp_delim(-delimiter, cmdstr_ptr);
  212. *replace = copy_parsing_escapes(cmdstr_ptr, idx);
  213. return ((cmdstr_ptr - cmdstr) + idx);
  214. }
  215. /*
  216. * returns the index in the string just past where the address ends.
  217. */
  218. static int get_address(const char *my_str, int *linenum, regex_t ** regex)
  219. {
  220. const char *pos = my_str;
  221. if (isdigit(*my_str)) {
  222. *linenum = strtol(my_str, (char**)&pos, 10);
  223. /* endstr shouldnt ever equal NULL */
  224. } else if (*my_str == '$') {
  225. *linenum = -1;
  226. pos++;
  227. } else if (*my_str == '/' || *my_str == '\\') {
  228. int next;
  229. char delimiter;
  230. char *temp;
  231. delimiter = '/';
  232. if (*my_str == '\\') delimiter = *++pos;
  233. next = index_of_next_unescaped_regexp_delim(delimiter, ++pos);
  234. temp = copy_parsing_escapes(pos, next);
  235. *regex = xmalloc(sizeof(regex_t));
  236. xregcomp(*regex, temp, G.regex_type|REG_NEWLINE);
  237. free(temp);
  238. /* Move position to next character after last delimiter */
  239. pos += (next+1);
  240. }
  241. return pos - my_str;
  242. }
  243. /* Grab a filename. Whitespace at start is skipped, then goes to EOL. */
  244. static int parse_file_cmd(sed_cmd_t *sed_cmd, const char *filecmdstr, char **retval)
  245. {
  246. int start = 0, idx, hack = 0;
  247. /* Skip whitespace, then grab filename to end of line */
  248. while (isspace(filecmdstr[start]))
  249. start++;
  250. idx = start;
  251. while (filecmdstr[idx] && filecmdstr[idx] != '\n')
  252. idx++;
  253. /* If lines glued together, put backslash back. */
  254. if (filecmdstr[idx] == '\n')
  255. hack = 1;
  256. if (idx == start)
  257. bb_error_msg_and_die("empty filename");
  258. *retval = xstrndup(filecmdstr+start, idx-start+hack+1);
  259. if (hack)
  260. (*retval)[idx] = '\\';
  261. return idx;
  262. }
  263. static int parse_subst_cmd(sed_cmd_t *sed_cmd, const char *substr)
  264. {
  265. int cflags = G.regex_type;
  266. char *match;
  267. int idx;
  268. /*
  269. * A substitution command should look something like this:
  270. * s/match/replace/ #gIpw
  271. * || | |||
  272. * mandatory optional
  273. */
  274. idx = parse_regex_delim(substr, &match, &sed_cmd->string);
  275. /* determine the number of back references in the match string */
  276. /* Note: we compute this here rather than in the do_subst_command()
  277. * function to save processor time, at the expense of a little more memory
  278. * (4 bits) per sed_cmd */
  279. /* process the flags */
  280. sed_cmd->which_match = 1;
  281. while (substr[++idx]) {
  282. /* Parse match number */
  283. if (isdigit(substr[idx])) {
  284. if (match[0] != '^') {
  285. /* Match 0 treated as all, multiple matches we take the last one. */
  286. const char *pos = substr + idx;
  287. /* FIXME: error check? */
  288. sed_cmd->which_match = (unsigned short)strtol(substr+idx, (char**) &pos, 10);
  289. idx = pos - substr;
  290. }
  291. continue;
  292. }
  293. /* Skip spaces */
  294. if (isspace(substr[idx])) continue;
  295. switch (substr[idx]) {
  296. /* Replace all occurrences */
  297. case 'g':
  298. if (match[0] != '^') sed_cmd->which_match = 0;
  299. break;
  300. /* Print pattern space */
  301. case 'p':
  302. sed_cmd->sub_p = 1;
  303. break;
  304. /* Write to file */
  305. case 'w':
  306. {
  307. char *temp;
  308. idx += parse_file_cmd(sed_cmd, substr+idx, &temp);
  309. break;
  310. }
  311. /* Ignore case (gnu exension) */
  312. case 'I':
  313. cflags |= REG_ICASE;
  314. break;
  315. /* Comment */
  316. case '#':
  317. while (substr[++idx]) /*skip all*/;
  318. /* Fall through */
  319. /* End of command */
  320. case ';':
  321. case '}':
  322. goto out;
  323. default:
  324. bb_error_msg_and_die("bad option in substitution expression");
  325. }
  326. }
  327. out:
  328. /* compile the match string into a regex */
  329. if (*match != '\0') {
  330. /* If match is empty, we use last regex used at runtime */
  331. sed_cmd->sub_match = xmalloc(sizeof(regex_t));
  332. xregcomp(sed_cmd->sub_match, match, cflags);
  333. }
  334. free(match);
  335. return idx;
  336. }
  337. /*
  338. * Process the commands arguments
  339. */
  340. static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr)
  341. {
  342. /* handle (s)ubstitution command */
  343. if (sed_cmd->cmd == 's')
  344. cmdstr += parse_subst_cmd(sed_cmd, cmdstr);
  345. /* handle edit cmds: (a)ppend, (i)nsert, and (c)hange */
  346. else if (strchr("aic", sed_cmd->cmd)) {
  347. if ((sed_cmd->end_line || sed_cmd->end_match) && sed_cmd->cmd != 'c')
  348. bb_error_msg_and_die
  349. ("only a beginning address can be specified for edit commands");
  350. for (;;) {
  351. if (*cmdstr == '\n' || *cmdstr == '\\') {
  352. cmdstr++;
  353. break;
  354. } else if (isspace(*cmdstr))
  355. cmdstr++;
  356. else
  357. break;
  358. }
  359. sed_cmd->string = xstrdup(cmdstr);
  360. parse_escapes(sed_cmd->string, sed_cmd->string, strlen(cmdstr), 0, 0);
  361. cmdstr += strlen(cmdstr);
  362. /* handle file cmds: (r)ead */
  363. } else if (strchr("rw", sed_cmd->cmd)) {
  364. if (sed_cmd->end_line || sed_cmd->end_match)
  365. bb_error_msg_and_die("command only uses one address");
  366. cmdstr += parse_file_cmd(sed_cmd, cmdstr, &sed_cmd->string);
  367. if (sed_cmd->cmd == 'w') {
  368. sed_cmd->sw_file = xfopen(sed_cmd->string, "w");
  369. sed_cmd->sw_last_char = '\n';
  370. }
  371. /* handle branch commands */
  372. } else if (strchr(":btT", sed_cmd->cmd)) {
  373. int length;
  374. cmdstr = skip_whitespace(cmdstr);
  375. length = strcspn(cmdstr, semicolon_whitespace);
  376. if (length) {
  377. sed_cmd->string = xstrndup(cmdstr, length);
  378. cmdstr += length;
  379. }
  380. }
  381. /* translation command */
  382. else if (sed_cmd->cmd == 'y') {
  383. char *match, *replace;
  384. int i = cmdstr[0];
  385. cmdstr += parse_regex_delim(cmdstr, &match, &replace)+1;
  386. /* \n already parsed, but \delimiter needs unescaping. */
  387. parse_escapes(match, match, strlen(match), i, i);
  388. parse_escapes(replace, replace, strlen(replace), i, i);
  389. sed_cmd->string = xzalloc((strlen(match) + 1) * 2);
  390. for (i = 0; match[i] && replace[i]; i++) {
  391. sed_cmd->string[i*2] = match[i];
  392. sed_cmd->string[i*2+1] = replace[i];
  393. }
  394. free(match);
  395. free(replace);
  396. }
  397. /* if it wasnt a single-letter command that takes no arguments
  398. * then it must be an invalid command.
  399. */
  400. else if (strchr("dDgGhHlnNpPqx={}", sed_cmd->cmd) == 0) {
  401. bb_error_msg_and_die("unsupported command %c", sed_cmd->cmd);
  402. }
  403. /* give back whatever's left over */
  404. return cmdstr;
  405. }
  406. /* Parse address+command sets, skipping comment lines. */
  407. static void add_cmd(const char *cmdstr)
  408. {
  409. sed_cmd_t *sed_cmd;
  410. int temp;
  411. /* Append this line to any unfinished line from last time. */
  412. if (G.add_cmd_line) {
  413. char *tp = xasprintf("%s\n%s", G.add_cmd_line, cmdstr);
  414. free(G.add_cmd_line);
  415. cmdstr = G.add_cmd_line = tp;
  416. }
  417. /* If this line ends with backslash, request next line. */
  418. temp = strlen(cmdstr);
  419. if (temp && cmdstr[--temp] == '\\') {
  420. if (!G.add_cmd_line)
  421. G.add_cmd_line = xstrdup(cmdstr);
  422. G.add_cmd_line[temp] = '\0';
  423. return;
  424. }
  425. /* Loop parsing all commands in this line. */
  426. while (*cmdstr) {
  427. /* Skip leading whitespace and semicolons */
  428. cmdstr += strspn(cmdstr, semicolon_whitespace);
  429. /* If no more commands, exit. */
  430. if (!*cmdstr) break;
  431. /* if this is a comment, jump past it and keep going */
  432. if (*cmdstr == '#') {
  433. /* "#n" is the same as using -n on the command line */
  434. if (cmdstr[1] == 'n')
  435. G.be_quiet++;
  436. cmdstr = strpbrk(cmdstr, "\n\r");
  437. if (!cmdstr) break;
  438. continue;
  439. }
  440. /* parse the command
  441. * format is: [addr][,addr][!]cmd
  442. * |----||-----||-|
  443. * part1 part2 part3
  444. */
  445. sed_cmd = xzalloc(sizeof(sed_cmd_t));
  446. /* first part (if present) is an address: either a '$', a number or a /regex/ */
  447. cmdstr += get_address(cmdstr, &sed_cmd->beg_line, &sed_cmd->beg_match);
  448. /* second part (if present) will begin with a comma */
  449. if (*cmdstr == ',') {
  450. int idx;
  451. cmdstr++;
  452. idx = get_address(cmdstr, &sed_cmd->end_line, &sed_cmd->end_match);
  453. if (!idx)
  454. bb_error_msg_and_die("no address after comma");
  455. cmdstr += idx;
  456. }
  457. /* skip whitespace before the command */
  458. cmdstr = skip_whitespace(cmdstr);
  459. /* Check for inversion flag */
  460. if (*cmdstr == '!') {
  461. sed_cmd->invert = 1;
  462. cmdstr++;
  463. /* skip whitespace before the command */
  464. cmdstr = skip_whitespace(cmdstr);
  465. }
  466. /* last part (mandatory) will be a command */
  467. if (!*cmdstr)
  468. bb_error_msg_and_die("missing command");
  469. sed_cmd->cmd = *(cmdstr++);
  470. cmdstr = parse_cmd_args(sed_cmd, cmdstr);
  471. /* Add the command to the command array */
  472. G.sed_cmd_tail->next = sed_cmd;
  473. G.sed_cmd_tail = G.sed_cmd_tail->next;
  474. }
  475. /* If we glued multiple lines together, free the memory. */
  476. free(G.add_cmd_line);
  477. G.add_cmd_line = NULL;
  478. }
  479. /* Append to a string, reallocating memory as necessary. */
  480. #define PIPE_GROW 64
  481. static void pipe_putc(char c)
  482. {
  483. if (G.pipeline.idx == G.pipeline.len) {
  484. G.pipeline.buf = xrealloc(G.pipeline.buf,
  485. G.pipeline.len + PIPE_GROW);
  486. G.pipeline.len += PIPE_GROW;
  487. }
  488. G.pipeline.buf[G.pipeline.idx++] = c;
  489. }
  490. static void do_subst_w_backrefs(char *line, char *replace)
  491. {
  492. int i,j;
  493. /* go through the replacement string */
  494. for (i = 0; replace[i]; i++) {
  495. /* if we find a backreference (\1, \2, etc.) print the backref'ed * text */
  496. if (replace[i] == '\\') {
  497. unsigned backref = replace[++i] - '0';
  498. if (backref <= 9) {
  499. /* print out the text held in G.regmatch[backref] */
  500. if (G.regmatch[backref].rm_so != -1) {
  501. j = G.regmatch[backref].rm_so;
  502. while (j < G.regmatch[backref].rm_eo)
  503. pipe_putc(line[j++]);
  504. }
  505. continue;
  506. }
  507. /* I _think_ it is impossible to get '\' to be
  508. * the last char in replace string. Thus we dont check
  509. * for replace[i] == NUL. (counterexample anyone?) */
  510. /* if we find a backslash escaped character, print the character */
  511. pipe_putc(replace[i]);
  512. continue;
  513. }
  514. /* if we find an unescaped '&' print out the whole matched text. */
  515. if (replace[i] == '&') {
  516. j = G.regmatch[0].rm_so;
  517. while (j < G.regmatch[0].rm_eo)
  518. pipe_putc(line[j++]);
  519. continue;
  520. }
  521. /* Otherwise just output the character. */
  522. pipe_putc(replace[i]);
  523. }
  524. }
  525. static int do_subst_command(sed_cmd_t *sed_cmd, char **line)
  526. {
  527. char *oldline = *line;
  528. int altered = 0;
  529. int match_count = 0;
  530. regex_t *current_regex;
  531. /* Handle empty regex. */
  532. if (sed_cmd->sub_match == NULL) {
  533. current_regex = G.previous_regex_ptr;
  534. if (!current_regex)
  535. bb_error_msg_and_die("no previous regexp");
  536. } else
  537. G.previous_regex_ptr = current_regex = sed_cmd->sub_match;
  538. /* Find the first match */
  539. if (REG_NOMATCH == regexec(current_regex, oldline, 10, G.regmatch, 0))
  540. return 0;
  541. /* Initialize temporary output buffer. */
  542. G.pipeline.buf = xmalloc(PIPE_GROW);
  543. G.pipeline.len = PIPE_GROW;
  544. G.pipeline.idx = 0;
  545. /* Now loop through, substituting for matches */
  546. do {
  547. int i;
  548. /* Work around bug in glibc regexec, demonstrated by:
  549. echo " a.b" | busybox sed 's [^ .]* x g'
  550. The match_count check is so not to break
  551. echo "hi" | busybox sed 's/^/!/g' */
  552. if (!G.regmatch[0].rm_so && !G.regmatch[0].rm_eo && match_count) {
  553. pipe_putc(*oldline++);
  554. continue;
  555. }
  556. match_count++;
  557. /* If we aren't interested in this match, output old line to
  558. end of match and continue */
  559. if (sed_cmd->which_match && sed_cmd->which_match != match_count) {
  560. for (i = 0; i < G.regmatch[0].rm_eo; i++)
  561. pipe_putc(*oldline++);
  562. continue;
  563. }
  564. /* print everything before the match */
  565. for (i = 0; i < G.regmatch[0].rm_so; i++)
  566. pipe_putc(oldline[i]);
  567. /* then print the substitution string */
  568. do_subst_w_backrefs(oldline, sed_cmd->string);
  569. /* advance past the match */
  570. oldline += G.regmatch[0].rm_eo;
  571. /* flag that something has changed */
  572. altered++;
  573. /* if we're not doing this globally, get out now */
  574. if (sed_cmd->which_match) break;
  575. } while (*oldline && (regexec(current_regex, oldline, 10, G.regmatch, 0) != REG_NOMATCH));
  576. /* Copy rest of string into output pipeline */
  577. while (*oldline)
  578. pipe_putc(*oldline++);
  579. pipe_putc(0);
  580. free(*line);
  581. *line = G.pipeline.buf;
  582. return altered;
  583. }
  584. /* Set command pointer to point to this label. (Does not handle null label.) */
  585. static sed_cmd_t *branch_to(char *label)
  586. {
  587. sed_cmd_t *sed_cmd;
  588. for (sed_cmd = G.sed_cmd_head.next; sed_cmd; sed_cmd = sed_cmd->next) {
  589. if (sed_cmd->cmd == ':' && sed_cmd->string && !strcmp(sed_cmd->string, label)) {
  590. return sed_cmd;
  591. }
  592. }
  593. bb_error_msg_and_die("can't find label for jump to '%s'", label);
  594. }
  595. static void append(char *s)
  596. {
  597. llist_add_to_end(&G.append_head, xstrdup(s));
  598. }
  599. static void flush_append(void)
  600. {
  601. char *data;
  602. /* Output appended lines. */
  603. while ((data = (char *)llist_pop(&G.append_head))) {
  604. fprintf(G.nonstdout, "%s\n", data);
  605. free(data);
  606. }
  607. }
  608. static void add_input_file(FILE *file)
  609. {
  610. G.input_file_list = xrealloc(G.input_file_list,
  611. (G.input_file_count + 1) * sizeof(FILE *));
  612. G.input_file_list[G.input_file_count++] = file;
  613. }
  614. /* Get next line of input from G.input_file_list, flushing append buffer and
  615. * noting if we ran out of files without a newline on the last line we read.
  616. */
  617. enum {
  618. NO_EOL_CHAR = 1,
  619. LAST_IS_NUL = 2,
  620. };
  621. static char *get_next_line(char *gets_char)
  622. {
  623. char *temp = NULL;
  624. int len;
  625. char gc;
  626. flush_append();
  627. /* will be returned if last line in the file
  628. * doesn't end with either '\n' or '\0' */
  629. gc = NO_EOL_CHAR;
  630. while (G.current_input_file < G.input_file_count) {
  631. FILE *fp = G.input_file_list[G.current_input_file];
  632. /* Read line up to a newline or NUL byte, inclusive,
  633. * return malloc'ed char[]. length of the chunk read
  634. * is stored in len. NULL if EOF/error */
  635. temp = bb_get_chunk_from_file(fp, &len);
  636. if (temp) {
  637. /* len > 0 here, it's ok to do temp[len-1] */
  638. char c = temp[len-1];
  639. if (c == '\n' || c == '\0') {
  640. temp[len-1] = '\0';
  641. gc = c;
  642. if (c == '\0') {
  643. int ch = fgetc(fp);
  644. if (ch != EOF)
  645. ungetc(ch, fp);
  646. else
  647. gc = LAST_IS_NUL;
  648. }
  649. }
  650. /* else we put NO_EOL_CHAR into *gets_char */
  651. break;
  652. /* NB: I had the idea of peeking next file(s) and returning
  653. * NO_EOL_CHAR only if it is the *last* non-empty
  654. * input file. But there is a case where this won't work:
  655. * file1: "a woo\nb woo"
  656. * file2: "c no\nd no"
  657. * sed -ne 's/woo/bang/p' input1 input2 => "a bang\nb bang"
  658. * (note: *no* newline after "b bang"!) */
  659. }
  660. /* Close this file and advance to next one */
  661. fclose(fp);
  662. G.current_input_file++;
  663. }
  664. *gets_char = gc;
  665. return temp;
  666. }
  667. /* Output line of text. */
  668. /* Note:
  669. * The tricks with NO_EOL_CHAR and last_puts_char are there to emulate gnu sed.
  670. * Without them, we had this:
  671. * echo -n thingy >z1
  672. * echo -n again >z2
  673. * >znull
  674. * sed "s/i/z/" z1 z2 znull | hexdump -vC
  675. * output:
  676. * gnu sed 4.1.5:
  677. * 00000000 74 68 7a 6e 67 79 0a 61 67 61 7a 6e |thzngy.agazn|
  678. * bbox:
  679. * 00000000 74 68 7a 6e 67 79 61 67 61 7a 6e |thzngyagazn|
  680. */
  681. static void puts_maybe_newline(char *s, FILE *file, char *last_puts_char, char last_gets_char)
  682. {
  683. char lpc = *last_puts_char;
  684. /* Need to insert a '\n' between two files because first file's
  685. * last line wasn't terminated? */
  686. if (lpc != '\n' && lpc != '\0') {
  687. fputc('\n', file);
  688. lpc = '\n';
  689. }
  690. fputs(s, file);
  691. /* 'x' - just something which is not '\n', '\0' or NO_EOL_CHAR */
  692. if (s[0])
  693. lpc = 'x';
  694. /* had trailing '\0' and it was last char of file? */
  695. if (last_gets_char == LAST_IS_NUL) {
  696. fputc('\0', file);
  697. lpc = 'x'; /* */
  698. } else
  699. /* had trailing '\n' or '\0'? */
  700. if (last_gets_char != NO_EOL_CHAR) {
  701. fputc(last_gets_char, file);
  702. lpc = last_gets_char;
  703. }
  704. if (ferror(file)) {
  705. xfunc_error_retval = 4; /* It's what gnu sed exits with... */
  706. bb_error_msg_and_die(bb_msg_write_error);
  707. }
  708. *last_puts_char = lpc;
  709. }
  710. #define sed_puts(s, n) (puts_maybe_newline(s, G.nonstdout, &last_puts_char, n))
  711. static int beg_match(sed_cmd_t *sed_cmd, const char *pattern_space)
  712. {
  713. int retval = sed_cmd->beg_match && !regexec(sed_cmd->beg_match, pattern_space, 0, NULL, 0);
  714. if (retval)
  715. G.previous_regex_ptr = sed_cmd->beg_match;
  716. return retval;
  717. }
  718. /* Process all the lines in all the files */
  719. static void process_files(void)
  720. {
  721. char *pattern_space, *next_line;
  722. int linenum = 0;
  723. char last_puts_char = '\n';
  724. char last_gets_char, next_gets_char;
  725. sed_cmd_t *sed_cmd;
  726. int substituted;
  727. /* Prime the pump */
  728. next_line = get_next_line(&next_gets_char);
  729. /* go through every line in each file */
  730. again:
  731. substituted = 0;
  732. /* Advance to next line. Stop if out of lines. */
  733. pattern_space = next_line;
  734. if (!pattern_space) return;
  735. last_gets_char = next_gets_char;
  736. /* Read one line in advance so we can act on the last line,
  737. * the '$' address */
  738. next_line = get_next_line(&next_gets_char);
  739. linenum++;
  740. restart:
  741. /* for every line, go through all the commands */
  742. for (sed_cmd = G.sed_cmd_head.next; sed_cmd; sed_cmd = sed_cmd->next) {
  743. int old_matched, matched;
  744. old_matched = sed_cmd->in_match;
  745. /* Determine if this command matches this line: */
  746. /* Are we continuing a previous multi-line match? */
  747. sed_cmd->in_match = sed_cmd->in_match
  748. /* Or is no range necessary? */
  749. || (!sed_cmd->beg_line && !sed_cmd->end_line
  750. && !sed_cmd->beg_match && !sed_cmd->end_match)
  751. /* Or did we match the start of a numerical range? */
  752. || (sed_cmd->beg_line > 0 && (sed_cmd->beg_line == linenum))
  753. /* Or does this line match our begin address regex? */
  754. || (beg_match(sed_cmd, pattern_space))
  755. /* Or did we match last line of input? */
  756. || (sed_cmd->beg_line == -1 && next_line == NULL);
  757. /* Snapshot the value */
  758. matched = sed_cmd->in_match;
  759. /* Is this line the end of the current match? */
  760. if (matched) {
  761. sed_cmd->in_match = !(
  762. /* has the ending line come, or is this a single address command? */
  763. (sed_cmd->end_line ?
  764. sed_cmd->end_line == -1 ?
  765. !next_line
  766. : (sed_cmd->end_line <= linenum)
  767. : !sed_cmd->end_match
  768. )
  769. /* or does this line matches our last address regex */
  770. || (sed_cmd->end_match && old_matched
  771. && (regexec(sed_cmd->end_match,
  772. pattern_space, 0, NULL, 0) == 0))
  773. );
  774. }
  775. /* Skip blocks of commands we didn't match. */
  776. if (sed_cmd->cmd == '{') {
  777. if (sed_cmd->invert ? matched : !matched) {
  778. while (sed_cmd->cmd != '}') {
  779. sed_cmd = sed_cmd->next;
  780. if (!sed_cmd)
  781. bb_error_msg_and_die("unterminated {");
  782. }
  783. }
  784. continue;
  785. }
  786. /* Okay, so did this line match? */
  787. if (sed_cmd->invert ? !matched : matched) {
  788. /* Update last used regex in case a blank substitute BRE is found */
  789. if (sed_cmd->beg_match) {
  790. G.previous_regex_ptr = sed_cmd->beg_match;
  791. }
  792. /* actual sedding */
  793. switch (sed_cmd->cmd) {
  794. /* Print line number */
  795. case '=':
  796. fprintf(G.nonstdout, "%d\n", linenum);
  797. break;
  798. /* Write the current pattern space up to the first newline */
  799. case 'P':
  800. {
  801. char *tmp = strchr(pattern_space, '\n');
  802. if (tmp) {
  803. *tmp = '\0';
  804. /* TODO: explain why '\n' below */
  805. sed_puts(pattern_space, '\n');
  806. *tmp = '\n';
  807. break;
  808. }
  809. /* Fall Through */
  810. }
  811. /* Write the current pattern space to output */
  812. case 'p':
  813. /* NB: we print this _before_ the last line
  814. * (of current file) is printed. Even if
  815. * that line is nonterminated, we print
  816. * '\n' here (gnu sed does the same) */
  817. sed_puts(pattern_space, '\n');
  818. break;
  819. /* Delete up through first newline */
  820. case 'D':
  821. {
  822. char *tmp = strchr(pattern_space, '\n');
  823. if (tmp) {
  824. tmp = xstrdup(tmp+1);
  825. free(pattern_space);
  826. pattern_space = tmp;
  827. goto restart;
  828. }
  829. }
  830. /* discard this line. */
  831. case 'd':
  832. goto discard_line;
  833. /* Substitute with regex */
  834. case 's':
  835. if (!do_subst_command(sed_cmd, &pattern_space))
  836. break;
  837. substituted |= 1;
  838. /* handle p option */
  839. if (sed_cmd->sub_p)
  840. sed_puts(pattern_space, last_gets_char);
  841. /* handle w option */
  842. if (sed_cmd->sw_file)
  843. puts_maybe_newline(
  844. pattern_space, sed_cmd->sw_file,
  845. &sed_cmd->sw_last_char, last_gets_char);
  846. break;
  847. /* Append line to linked list to be printed later */
  848. case 'a':
  849. append(sed_cmd->string);
  850. break;
  851. /* Insert text before this line */
  852. case 'i':
  853. sed_puts(sed_cmd->string, '\n');
  854. break;
  855. /* Cut and paste text (replace) */
  856. case 'c':
  857. /* Only triggers on last line of a matching range. */
  858. if (!sed_cmd->in_match)
  859. sed_puts(sed_cmd->string, NO_EOL_CHAR);
  860. goto discard_line;
  861. /* Read file, append contents to output */
  862. case 'r':
  863. {
  864. FILE *rfile;
  865. rfile = fopen(sed_cmd->string, "r");
  866. if (rfile) {
  867. char *line;
  868. while ((line = xmalloc_getline(rfile))
  869. != NULL)
  870. append(line);
  871. xprint_and_close_file(rfile);
  872. }
  873. break;
  874. }
  875. /* Write pattern space to file. */
  876. case 'w':
  877. puts_maybe_newline(
  878. pattern_space, sed_cmd->sw_file,
  879. &sed_cmd->sw_last_char, last_gets_char);
  880. break;
  881. /* Read next line from input */
  882. case 'n':
  883. if (!G.be_quiet)
  884. sed_puts(pattern_space, last_gets_char);
  885. if (next_line) {
  886. free(pattern_space);
  887. pattern_space = next_line;
  888. last_gets_char = next_gets_char;
  889. next_line = get_next_line(&next_gets_char);
  890. linenum++;
  891. break;
  892. }
  893. /* fall through */
  894. /* Quit. End of script, end of input. */
  895. case 'q':
  896. /* Exit the outer while loop */
  897. free(next_line);
  898. next_line = NULL;
  899. goto discard_commands;
  900. /* Append the next line to the current line */
  901. case 'N':
  902. {
  903. int len;
  904. /* If no next line, jump to end of script and exit. */
  905. if (next_line == NULL) {
  906. /* Jump to end of script and exit */
  907. free(next_line);
  908. next_line = NULL;
  909. goto discard_line;
  910. /* append next_line, read new next_line. */
  911. }
  912. len = strlen(pattern_space);
  913. pattern_space = realloc(pattern_space, len + strlen(next_line) + 2);
  914. pattern_space[len] = '\n';
  915. strcpy(pattern_space + len+1, next_line);
  916. last_gets_char = next_gets_char;
  917. next_line = get_next_line(&next_gets_char);
  918. linenum++;
  919. break;
  920. }
  921. /* Test/branch if substitution occurred */
  922. case 't':
  923. if (!substituted) break;
  924. substituted = 0;
  925. /* Fall through */
  926. /* Test/branch if substitution didn't occur */
  927. case 'T':
  928. if (substituted) break;
  929. /* Fall through */
  930. /* Branch to label */
  931. case 'b':
  932. if (!sed_cmd->string) goto discard_commands;
  933. else sed_cmd = branch_to(sed_cmd->string);
  934. break;
  935. /* Transliterate characters */
  936. case 'y':
  937. {
  938. int i, j;
  939. for (i = 0; pattern_space[i]; i++) {
  940. for (j = 0; sed_cmd->string[j]; j += 2) {
  941. if (pattern_space[i] == sed_cmd->string[j]) {
  942. pattern_space[i] = sed_cmd->string[j + 1];
  943. break;
  944. }
  945. }
  946. }
  947. break;
  948. }
  949. case 'g': /* Replace pattern space with hold space */
  950. free(pattern_space);
  951. pattern_space = xstrdup(G.hold_space ? G.hold_space : "");
  952. break;
  953. case 'G': /* Append newline and hold space to pattern space */
  954. {
  955. int pattern_space_size = 2;
  956. int hold_space_size = 0;
  957. if (pattern_space)
  958. pattern_space_size += strlen(pattern_space);
  959. if (G.hold_space)
  960. hold_space_size = strlen(G.hold_space);
  961. pattern_space = xrealloc(pattern_space,
  962. pattern_space_size + hold_space_size);
  963. if (pattern_space_size == 2)
  964. pattern_space[0] = 0;
  965. strcat(pattern_space, "\n");
  966. if (G.hold_space)
  967. strcat(pattern_space, G.hold_space);
  968. last_gets_char = '\n';
  969. break;
  970. }
  971. case 'h': /* Replace hold space with pattern space */
  972. free(G.hold_space);
  973. G.hold_space = xstrdup(pattern_space);
  974. break;
  975. case 'H': /* Append newline and pattern space to hold space */
  976. {
  977. int hold_space_size = 2;
  978. int pattern_space_size = 0;
  979. if (G.hold_space)
  980. hold_space_size += strlen(G.hold_space);
  981. if (pattern_space)
  982. pattern_space_size = strlen(pattern_space);
  983. G.hold_space = xrealloc(G.hold_space,
  984. hold_space_size + pattern_space_size);
  985. if (hold_space_size == 2)
  986. *G.hold_space = 0;
  987. strcat(G.hold_space, "\n");
  988. if (pattern_space)
  989. strcat(G.hold_space, pattern_space);
  990. break;
  991. }
  992. case 'x': /* Exchange hold and pattern space */
  993. {
  994. char *tmp = pattern_space;
  995. pattern_space = G.hold_space ? : xzalloc(1);
  996. last_gets_char = '\n';
  997. G.hold_space = tmp;
  998. break;
  999. }
  1000. }
  1001. }
  1002. }
  1003. /*
  1004. * exit point from sedding...
  1005. */
  1006. discard_commands:
  1007. /* we will print the line unless we were told to be quiet ('-n')
  1008. or if the line was suppressed (ala 'd'elete) */
  1009. if (!G.be_quiet)
  1010. sed_puts(pattern_space, last_gets_char);
  1011. /* Delete and such jump here. */
  1012. discard_line:
  1013. flush_append();
  1014. free(pattern_space);
  1015. goto again;
  1016. }
  1017. /* It is possible to have a command line argument with embedded
  1018. * newlines. This counts as multiple command lines.
  1019. * However, newline can be escaped: 's/e/z\<newline>z/'
  1020. * We check for this.
  1021. */
  1022. static void add_cmd_block(char *cmdstr)
  1023. {
  1024. char *sv, *eol;
  1025. cmdstr = sv = xstrdup(cmdstr);
  1026. do {
  1027. eol = strchr(cmdstr, '\n');
  1028. next:
  1029. if (eol) {
  1030. /* Count preceding slashes */
  1031. int slashes = 0;
  1032. char *sl = eol;
  1033. while (sl != cmdstr && *--sl == '\\')
  1034. slashes++;
  1035. /* Odd number of preceding slashes - newline is escaped */
  1036. if (slashes & 1) {
  1037. strcpy(eol-1, eol);
  1038. eol = strchr(eol, '\n');
  1039. goto next;
  1040. }
  1041. *eol = '\0';
  1042. }
  1043. add_cmd(cmdstr);
  1044. cmdstr = eol + 1;
  1045. } while (eol);
  1046. free(sv);
  1047. }
  1048. int sed_main(int argc, char **argv);
  1049. int sed_main(int argc, char **argv)
  1050. {
  1051. enum {
  1052. OPT_in_place = 1 << 0,
  1053. };
  1054. unsigned opt;
  1055. llist_t *opt_e, *opt_f;
  1056. int status = EXIT_SUCCESS;
  1057. INIT_G();
  1058. /* destroy command strings on exit */
  1059. if (ENABLE_FEATURE_CLEAN_UP) atexit(sed_free_and_close_stuff);
  1060. /* Lie to autoconf when it starts asking stupid questions. */
  1061. if (argc == 2 && !strcmp(argv[1], "--version")) {
  1062. puts("This is not GNU sed version 4.0");
  1063. return 0;
  1064. }
  1065. /* do normal option parsing */
  1066. opt_e = opt_f = NULL;
  1067. opt_complementary = "e::f::" /* can occur multiple times */
  1068. "nn"; /* count -n */
  1069. opt = getopt32(argv, "irne:f:", &opt_e, &opt_f,
  1070. &G.be_quiet); /* counter for -n */
  1071. argc -= optind;
  1072. argv += optind;
  1073. if (opt & OPT_in_place) { // -i
  1074. atexit(cleanup_outname);
  1075. }
  1076. if (opt & 0x2) G.regex_type |= REG_EXTENDED; // -r
  1077. //if (opt & 0x4) G.be_quiet++; // -n
  1078. while (opt_e) { // -e
  1079. add_cmd_block(opt_e->data);
  1080. opt_e = opt_e->link;
  1081. /* we leak opt_e here... */
  1082. }
  1083. while (opt_f) { // -f
  1084. char *line;
  1085. FILE *cmdfile;
  1086. cmdfile = xfopen(opt_f->data, "r");
  1087. while ((line = xmalloc_getline(cmdfile)) != NULL) {
  1088. add_cmd(line);
  1089. free(line);
  1090. }
  1091. fclose(cmdfile);
  1092. opt_f = opt_f->link;
  1093. /* we leak opt_f here... */
  1094. }
  1095. /* if we didn't get a pattern from -e or -f, use argv[0] */
  1096. if (!(opt & 0x18)) {
  1097. if (!argc)
  1098. bb_show_usage();
  1099. add_cmd_block(*argv++);
  1100. argc--;
  1101. }
  1102. /* Flush any unfinished commands. */
  1103. add_cmd("");
  1104. /* By default, we write to stdout */
  1105. G.nonstdout = stdout;
  1106. /* argv[0..(argc-1)] should be names of file to process. If no
  1107. * files were specified or '-' was specified, take input from stdin.
  1108. * Otherwise, we process all the files specified. */
  1109. if (argv[0] == NULL) {
  1110. if (opt & OPT_in_place)
  1111. bb_error_msg_and_die(bb_msg_requires_arg, "-i");
  1112. add_input_file(stdin);
  1113. process_files();
  1114. } else {
  1115. int i;
  1116. FILE *file;
  1117. for (i = 0; i < argc; i++) {
  1118. struct stat statbuf;
  1119. int nonstdoutfd;
  1120. if (LONE_DASH(argv[i]) && !(opt & OPT_in_place)) {
  1121. add_input_file(stdin);
  1122. process_files();
  1123. continue;
  1124. }
  1125. file = fopen_or_warn(argv[i], "r");
  1126. if (!file) {
  1127. status = EXIT_FAILURE;
  1128. continue;
  1129. }
  1130. if (!(opt & OPT_in_place)) {
  1131. add_input_file(file);
  1132. continue;
  1133. }
  1134. G.outname = xasprintf("%sXXXXXX", argv[i]);
  1135. nonstdoutfd = mkstemp(G.outname);
  1136. if (-1 == nonstdoutfd)
  1137. bb_perror_msg_and_die("cannot create temp file %s", G.outname);
  1138. G.nonstdout = fdopen(nonstdoutfd, "w");
  1139. /* Set permissions of output file */
  1140. fstat(fileno(file), &statbuf);
  1141. fchmod(nonstdoutfd, statbuf.st_mode);
  1142. add_input_file(file);
  1143. process_files();
  1144. fclose(G.nonstdout);
  1145. G.nonstdout = stdout;
  1146. /* unlink(argv[i]); */
  1147. // FIXME: error check / message?
  1148. rename(G.outname, argv[i]);
  1149. free(G.outname);
  1150. G.outname = 0;
  1151. }
  1152. if (G.input_file_count > G.current_input_file)
  1153. process_files();
  1154. }
  1155. return status;
  1156. }