#
/*
 *
 *
 * The information in this document is subject to change
 * without notice and should not be construed as a commitment
 * by Digital Equipment Corporation or by DECUS.
 *
 * Neither Digital Equipment Corporation, DECUS, nor the authors
 * assume any responsibility for the use or reliability of this
 * document or the described software.
 *
 *	Copyright (C) 1980, DECUS
 *
 *
 * General permission to copy or modify, but not for profit, is
 * hereby granted, provided that the above copyright notice is
 * included and reference made to the fact that reproduction
 * privileges were granted by DECUS.
 *
 */

/*
 * Kwik index generator.
 *
 * Usage:
 *
 *	kwik [-r] [-w width] [-t offset] [-s] [-x excludefile] [file ...]
 *
 * Description
 *
 *	Kwik constructs a keyword in context (kwik) index using the data
 *	in the named file and writes the resulting index to the standard
 *	output.  If no files are given the standard input is used; kwik
 *	may be used as a filter.
 *
 *	The unit of information to kwik is the word.  Words are text that
 *	begin with letters or numbers and continue to "white space".
 *	Note that kwik does not know about hyphenation.
 *
 *	Normally the page width is 80 characters.  The '-w' option
 *	changes the width to a user specified value.
 *
 *	The '-r' option makes the kwik index in reverse order.
 *
 *	The '-s' option allows indexing of words in the following stop-
 *	list:
 *
 *		a	and	for	in	on	to
 *		an	by	from	of	the	with
 *
 *	Normally, these words are ignored.
 *
 *	By using the '-x' option, this list may be extended or replaced:
 *
 *		-x file		extend the default stoplist
 *		-s -x file	replace the default stoplist
 *
 *	The stoplist is built after all options have been read, so the
 *	relative order of -s and -x on the command line does not matter.
 *
 *	The '-t' option allows construction of index tables.  Each input
 *	line consists of the index entry, one tab character, and the
 *	text.  Output will be in the format:
 *
 *		index	rotated_text
 *
 *	where "rotated_text" will begin at column "offset".  It is the
 *	user's responsibility to determine that all index entries are no
 *	longer than "offset-1" bytes long.
 *
 *	The '-x' option allows inclusion of an exclude file (generally
 *	containing function words).  If the index word is contained in
 *	the exclude file, it will not be indexed.  The exclude file need
 *	not be in any particular order.  Only one exclude file is used;
 *	if -x is given more than once, the last file named wins.
 *
 * Diagnostics
 *
 *	"?KWIK-E-bad width ..." / "?KWIK-E-bad offset ..."
 *		if the line width is less than 2 or greater than 128, if
 *		the offset is less than 1 or greater than 128, or if the
 *		width left for the text after the offset is too small.
 *	"kwik: file: cannot open. Continuing"
 *		if a file cannot be accessed for reading.
 *	"?KWIK-E-can't open exclude file"
 *		if the exclude file cannot be accessed for reading.
 *	"Sort: Cannot create temp. file"
 *		if the required file cannot be created.
 *	"Sort: Cannot reopen temp. file"
 *		if the required file cannot be reopened.
 *
 * Author:
 *
 *	David Conroy.
 *	Revised by Martin Minow
 *	Format change by Bob Denny
 *
 * Bugs
 *
 */

#include <stdio.h>

#ifndef	TRUE
#define	TRUE	1
#define	FALSE	0
#endif

#define	FOLD	'\t'			/* Field separator in records	*/
#define	NBUF	128			/* Longest input/output line	*/
/*
 * A sort record holds the key word, the whole text line (in two
 * pieces), optionally the index entry, up to three FOLD bytes and a
 * terminating NUL.  The key and the line are each shorter than NBUF,
 * so 2*NBUF+4 bytes always suffice.
 */
#define	NREC	(2 * NBUF + 4)
/*
 * Smallest text width unrotate() can handle: the '|' separator is
 * stored two bytes below the middle of the line, which must not fall
 * in front of the output buffer.
 */
#define	MINWIDTH 5

extern int	sort_r;			/* Reverse kwik			*/

/*
 * Library and sort-package routines that return pointers.  The types
 * are taken from the way the results are used in this file.
 */
extern char	*cpystr();		/* Copy, return -> trailing NUL	*/
extern char	*fgetss();		/* Read a line, NULL at the end	*/
extern char	*sorto();		/* Next sorted record or NULL	*/

static int	width	= 80;
static int	sflag	= 1;		/* Stop list (on)		*/
static char	*stopfile = "";		/* User's stop list		*/
static int	tflag	= 0;		/* Table format flag		*/
static int	offset	= 0;		/* Table format offset		*/
static char	inbuf[NREC];		/* Input line / sorted record	*/
static char	outbuf[NREC];		/* Record to sort / output line	*/
static char	tbuf[NBUF];		/* Index entry (-t only)	*/

/*
 * This table contains '1' for bytes that may begin indexed words.
 * This may be defined as the regular expression "[0-9A-Za-z]"
 * It has 128 entries, so it must only be indexed by 7-bit values.
 */

static char ok[] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/*   0.. 15	*/
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/*  16.. 31	*/
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/*  32.. 47	*/
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,	/*  48.. 63 0-9	*/
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/*  64.. 79 A-O	*/
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,	/*  80.. 95 P-Z	*/
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/*  96..111 a-o	*/
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0	/* 112..127 p-z	*/
};

/*
 * Stop words are stored in a sorted bucket table.  There is one
 * bucket for each byte value ' '..127, selected by the first byte of
 * the word; each bucket is a linked list kept in ascending strcmp()
 * order so that testexclude() can stop at the first larger entry.
 */

struct stoplist {
	struct stoplist	*next;
	char		*word;
};

/*
 * Hack for the Decus compiler
 */

static struct stoplist st_and	= { NULL,	"and"	};
static struct stoplist st_an	= { &st_and,	"an"	};
static struct stoplist st_a	= { &st_an,	"a"	};
#define	astop	st_a
static struct stoplist st_by	= { NULL,	"by"	};
#define	bstop	st_by
static struct stoplist st_from	= { NULL,	"from"	};
static struct stoplist st_for	= { &st_from,	"for"	};
#define	fstop	st_for
static struct stoplist st_in	= { NULL,	"in"	};
#define	istop	st_in
static struct stoplist st_on	= { NULL,	"on"	};
static struct stoplist st_of	= { &st_on,	"of"	};
#define	ostop	st_of
static struct stoplist st_to	= { NULL,	"to"	};
static struct stoplist st_the	= { &st_to,	"the"	};
#define	tstop	st_the
static struct stoplist st_with	= { NULL,	"with"	};
#define	wstop	st_with

#define	NSTOP	(128 - ' ')

static struct stoplist *stoplist[NSTOP] = {
/*  32.. 39 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  40.. 47 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  48.. 55 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  56.. 63 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  64.. 71 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  72.. 79 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  80.. 87 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  88.. 95 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
/*  96..103 */	NULL,	&astop,	&bstop,	NULL,	NULL,	NULL,	&fstop,	NULL,
/* 104..111 */	NULL,	&istop,	NULL,	NULL,	NULL,	NULL,	NULL,	&ostop,
/* 112..119 */	NULL,	NULL,	NULL,	NULL,	&tstop,	NULL,	NULL,	&wstop,
/* 120..127 */	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,	NULL,
};

/*
 * Parse the options, build the stoplist, rotate every input line
 * into the sort, then read the sorted records back and print them.
 */
main(argc, argv)
int		argc;
char		*argv[];
{
	register char	*cp;
	register int	i;
	FILE		*fp;
	int		nf;		/* Number of file arguments	*/

	nf = argc - 1;
	/*
	 * NOTE(review): the head of this loop (the loop control, the
	 * removal of the option itself from argv, the switch and the
	 * -r case) was reconstructed from the surviving cases and the
	 * usage text.  In particular, folding the option letter with
	 * tolower() and setting sort_r to 1 are assumptions -- confirm
	 * against a pristine copy of the source.
	 */
	for (i = 1; i < argc; ++i) {
		cp = argv[i];
		if (*cp == '-') {
			/*
			 * Options (and their values, below) are erased
			 * from argv so that only file names are left
			 * for the second pass.
			 */
			argv[i] = NULL;
			--nf;
			switch (tolower(cp[1])) {
			case 'r':
				sort_r = 1;
				break;

			case 'w':
				if (++i >= argc)
					usage();
				width = getv(argv[i], 2, NBUF, "width");
				--nf;
				argv[i] = NULL;
				break;

			case 's':
				sflag = 0;
				break;

			case 't':
				if (++i >= argc)
					usage();
				tflag++;
				offset = getv(argv[i], 1, NBUF, "offset");
				--nf;
				argv[i] = NULL;
				break;

			case 'x':
				if (++i >= argc)
					usage();
				stopfile = argv[i];
				--nf;
				argv[i] = NULL;
				break;

			default:
				usage();
			}
		}
	}
	getexclude();			/* Initialize stoplist		*/
	width -= offset;		/* Offset is for index		*/
	offset -= 1;			/* One byte follows the index	*/
	/*
	 * A narrower text field would make unrotate() store outside
	 * of the output buffer.
	 */
	if (width < MINWIDTH) {
		error("?KWIK-E-bad %s %d, minimum = %d, maximum = %d\n",
			"text width", width, MINWIDTH, NBUF);
	}
	if (nf <= 0)
		rotate(stdin);
	else {
		for (i = 1; i < argc; ++i) {
			if ((cp = argv[i]) == NULL)
				continue;
			if ((fp = freopen(cp, "r", stdin)) == NULL) {
				fprintf(stderr,
					"kwik: %s: cannot open. Continuing\n",
					cp);
				continue;
			}
			rotate(fp);
		}
	}
	sorta(NULL);			/* No more records for the sort	*/
	unrotate();
}

/*
 * Print the command summary and give up via error().
 * The callers rely on error() not returning.
 */
usage()
{
	fprintf(stderr, "-KWIK-parameter error, usage is\n");
	fprintf(stderr, "\tkwik\t[-r]\t\tReverse kwik\n");
	fprintf(stderr, "\t\t[-s]\t\tDo not use standard stoplist\n");
	fprintf(stderr, "\t\t[-w width]\tOutput line width\n");
	fprintf(stderr, "\t\t[-t offset]\tInput in index format\n");
	fprintf(stderr, "\t\t[-x exlude]\tExclude these words from index\n");
	fprintf(stderr, "\t\t[file ...]\tFiles to be processed\n");
	error("?KWIK-E-Cannot proceed");
}

/*
 * Convert a numeric option value and check it against [min, max].
 * Out of range values are reported through error().
 */
getv(arg, min, max, who)
char		*arg;			/* What to convert		*/
int		min;			/* Minimum acceptable value	*/
int		max;			/* Maximum acceptable value	*/
char		*who;			/* For error printout		*/
{
	register int	result;

	result = atoi(arg);
	if (result < min || result > max) {
		error("?KWIK-E-bad %s %d, minimum = %d, maximum = %d\n",
			who, result, min, max);
	}
	return (result);
}

getexclude()
/*
 * Store words to exclude
 */
{
	FILE				*fp;
	register struct stoplist	**stopp;

	/*
	 * If -s was given (sflag cleared), erase the built-in stoplist
	 */
	if (!sflag) {
		for (stopp = stoplist; stopp < &stoplist[NSTOP];)
			*stopp++ = NULL;
	}
	if (*stopfile == 0)
		return;
	if ((fp = fopen(stopfile, "r")) == NULL)
		error("?KWIK-E-can't open exclude file \"%s\"\n", stopfile);
	/*
	 * One word per line.  Lines are limited to NBUF bytes, as they
	 * are everywhere else, although inbuf itself is larger.
	 */
	while (fgetss(inbuf, NBUF, fp) != NULL) {
		saveexclude(inbuf);
	}
	fclose(fp);
}

/*
 * Add one word to the stoplist.  The word is folded to lowercase in
 * place and linked into the bucket for its first byte so that the
 * bucket stays in ascending strcmp() order -- testexclude() depends
 * on that order.  A word that is already present is not stored again.
 */
saveexclude(what)
char		*what;			/* What to save			*/
{
	register char			*p;
	register struct stoplist	*stp;
	register int			c;
	struct stoplist			**link;
	struct stoplist			*newstop;
	int				bucket;

	/*
	 * Force line to lowercase
	 */
	for (p = what; (c = *p) != 0;)
		*p++ = tolower(c);
	/*
	 * Check the bucket number itself; an empty line or a first
	 * byte outside ' '..127 has no bucket.
	 */
	bucket = *what - ' ';
	if (bucket < 0 || bucket >= NSTOP)
		error("Illegal exclusion \"%s\"\n", what);
	/*
	 * Find the link that should point to the new entry: the bucket
	 * head, or the next field of the last entry that sorts before
	 * the new word.  stp is left at the entry that will follow it
	 * (NULL at the end of the list).
	 */
	for (link = &stoplist[bucket]; (stp = *link) != NULL;
			link = &stp->next) {
		if ((c = strcmp(stp->word, what)) == 0)
			return;
		if (c > 0)
			break;
	}
	if ((newstop = malloc(sizeof (struct stoplist))) == NULL
	 || (p = malloc(strlen(what) + 1)) == NULL)
		error("Out of space in saveexclude");
	cpystr((newstop->word = p), what);
	newstop->next = stp;
	*link = newstop;
}

testexclude(what)
register char	*what;			/* Is it excluded?		*/
/*
 * Return true if "what" is in the exclude table.  Note: "what" is
 * guaranteed to be in lowercase and what[0] is guaranteed to be
 * a reasonable printing character.
 */
{
	register struct stoplist	*stp;
	register int			test;

	test = *what - ' ';
	if (test < 0 || test >= NSTOP)
		error("Bug: illegal testexclude \"%s\"\n", what);
	stp = stoplist[test];
	/*
	 * The bucket is in ascending order, so the search fails as
	 * soon as an entry larger than "what" is seen.
	 */
	while (stp != NULL) {
		if ((test = strcmp(stp->word, what)) > 0)
			return (FALSE);
		else if (test == 0)
			return (TRUE);
		else	stp = stp->next;
	}
	return (FALSE);
}

/*
 * Read lines from fp and hand one record to stuff() for every word
 * in every line.
 */
rotate(fp)
FILE		*fp;
{
	register char	*p;
	register char	*tp;
	register char	*inp;

	while (fgetss(inbuf, NBUF, fp) != NULL) {
		inp = inbuf;
		/*
		 * If index mode, get the index entry
		 */
		if (tflag) {
			for (tp = tbuf; *inp != 0 && *inp != '\t';)
				*tp++ = *inp++;
			*tp = 0;
			if (*inp == 0) {
				/* "%%" -- the message begins with a '%' */
				fprintf(stderr,
					"%%KWIK-W-no index for \"%s\"\n",
					inbuf);
			}
			else	inp++;
		}
		/*
		 * Erase junk from the rest of the line.  Control bytes
		 * (tabs too, so the text can never contain FOLD) and
		 * bytes with the high bit set become blanks; the latter
		 * are tested explicitly so that ok[] is never indexed
		 * out of range where plain char is unsigned.
		 */
		for (p = inp; *p != 0; p++) {
			if (*p < ' ' || (*p & 0200) != 0)
				*p = ' ';
		}
		/*
		 * Skip to a word, output it, skip to the end of the word
		 */
		for (p = inp;;) {
			while (!ok[*p] && *p != 0)
				p++;
			if (*p == 0)
				break;
			stuff(p, inp);
			while (*p > ' ')
				p++;
		}
	}
}

stuff(fold_point, start)
char		*fold_point;		/* Where to rotate from		*/
char		*start;			/* Start of the text (if indexing) */
/*
 * Stuff this entry (assuming it isn't excluded).  The record built
 * in outbuf is
 *
 *	key FOLD text-from-the-word-on FOLD text-before-the-word
 *
 * followed, in index mode, by FOLD and the index entry.  See NREC
 * for why this always fits.
 */
{
	register char	*p;
	register char	*bp;
	register int	c;

	p = fold_point;
	bp = outbuf;
	/*
	 * Get the sort argument, test against the exclusion buffer
	 */
	while ((c = *p++) > ' ')
		*bp++ = tolower(c);
	*bp = 0;
	if (testexclude(outbuf))
		return;
	*bp++ = FOLD;
	/*
	 * Copy the input from the rotate point to the end of the line
	 */
	bp = cpystr(bp, fold_point);
	*bp++ = FOLD;
	/*
	 * Copy the rest of the input (from the start to the rotate point)
	 */
	p = start;
	while (p < fold_point)
		*bp++ = *p++;
	/*
	 * If indexing, append the index entry
	 */
	if (tflag) {
		*bp++ = FOLD;
		bp = cpystr(bp, tbuf);
	}
	*bp = 0;
	sorta(outbuf);
}

/*
 * Read the sorted records back, lay each one out around the middle
 * of a "width" byte line (keyword just right of the middle, the text
 * in front of it to the left, both wrapping around the line ends)
 * and print it, preceded by the index entry in index mode.
 */
unrotate()
{
	register char	*in;
	register char	*out;
	register int	c;
	char		*start;
	char		*rest;
	char		*bufend;
	char		*middle;
	char		*next();

	bufend = &outbuf[width];
	middle = &outbuf[(width - 1) / 2];	/* Fold here		*/
	while (sorto(inbuf) != NULL) {
		/*
		 * Skip the sort key
		 */
		for (in = inbuf; (c = *in++) && c != FOLD;)
			;
		if (c == 0)
			error("Bug: no first fold");
		/*
		 * Partition the text line
		 */
		start = in;		/* start -> after fold	*/
		while ((c = *in) && c != FOLD)
			in++;
		if (c == 0)
			error("Bug: missing second fold");
		*in++ = 0;		/* Terminate right side	*/
		rest = in;
		while ((c = *in) != 0 && c != FOLD)
			in++;
		/*
		 * Output the index
		 */
		if (tflag) {
			if (c == 0)
				error("Bug: missing third fold");
			else {
				*in++ = 0;	/* Terminate left side	*/
				/*
				 * Left justify the index in "offset"
				 * columns; '*' takes the field width
				 * from the argument list.
				 */
				printf("%-*s ", offset, in);
			}
		}
		else {
			if (c != 0)
				error("Bug: extra fold");
		}
		/*
		 * Partition the line.  At this point:
		 *	start -> line after the fold
		 *	rest  -> line from start to fold
		 * Clear the line and stuff the text into it
		 */
		out = outbuf;
		while (out < bufend)
			*out++ = ' ';
		/*
		 * Copy from "start" to the right half of the output buffer
		 * This algorithm was taken from the Lawrence Livermore
		 * tool kit.
		 */
		out = middle;
		for (in = start; (c = *in) != 0; in++) {
			/*
			 * At the start of a word: wrap to the left
			 * margin if the word would not fit.
			 */
			if (in > start && in[-1] == ' ') {
				if (next(1, start, in, out) >= bufend)
					out = outbuf;
			}
			if (out >= bufend)
				out = outbuf;
			*out++ = c;
		}
		/*
		 * Copy from the end of the text to the middle (backwards)
		 */
/****		out = middle;		****/
		out = middle - 1;				/*RBD*/
		*--out = '|';					/*RBD*/
		--out;						/*RBD*/
		for (in = rest; *in != 0; in++)
			;
		while (--in >= rest) {
			out--;
			/*
			 * At the end of a word: wrap to the right
			 * margin if the word would not fit.
			 */
			if (in[1] == ' ') {
				if (next(-1, rest, in, out) < outbuf)
					out = bufend - 1;
			}
			if (out < outbuf)
				out = bufend - 1;
			*out = *in;
		}
		/*
		 * Delete trailing blanks.  "out" stops just after the
		 * last non-blank byte and never moves below outbuf.
		 */
		out = bufend;
		while (out > outbuf && out[-1] == ' ')
			out--;
		*out = 0;
		printf("%s\n", outbuf);
	}
}

/*
 * Return where "out" would be after copying the word at "in": both
 * pointers are stepped by "increment" until a blank or a NUL is
 * seen, or until "in" moves below "edge".
 */
char *
next(increment, edge, in, out)
int		increment;		/* Which direction (+1 | -1)	*/
char		*edge;			/* Lower limit for in		*/
register char	*in;			/* From pointer			*/
register char	*out;			/* Output pointer		*/
{
	for (; in >= edge; in += increment) {
		if (*in == ' ' || *in == 0)
			break;
		out += increment;
	}
	return (out);
}