/* * Kwik index generator. */ /*)BUILD $(FILES) = { kwik sorts } $(STACK) = 4000 $(TKBOPTIONS) = { TASK = ...KWI } */ #ifdef DOCUMENTATION title kwik Keyword in Context Index index Keyword in Context Index synopsis kwik [options] [file ...] description Kwik constructs a keyword in context (kwik) index using the data in the named files, writing the resulting index to the standard output. The standard input is read if no files are specified; kwik may be used as a filter. The following options are defined: .lm +16 .s.i -16;-s The kwik index normally excludes common (stop) words. Specifing the '-s' option empties the stop list, thus including the following words: .s.nf a by in to an for of the and from on with .s.f .s.i -16;-r Make the index in reverse alphabetic order. .s.i -16;-t offset This is used to build index tables. The input is entered in the following format: .s nameindex text .s The kwik index will be output with the name in the left hand column. The kwik'ed text then follows. The '-t' option takes a mandatory argument: the column at which the first byte of the kwik'ed text should be placed. For example, the index for the Decus library documentation was produced by the following command: .s.nf kwik -t 16 -w 64 outfile .f .s.i -16;-x file__name The named file contains a user-specified exclusion (stopword) list. The '-x' option may be repeated if multiple exclusion lists are needed. Note that the order of the '-s' and '-x' options is important: .lm +4 .s.i-4;kwik -x file .s The file contains an exclusion list, one word per line. Append the contents of the file to the default list. .s.i -4;kwik -s -x file .s Replace the default stoplist by the contents of the named file. .s.i -4;kwik -x file -s .s After reading the exclusion file, the entire stop list is erased. (This is not a useful procedure.) .lm -4 .s.i -16;-w width The output line width is normally 80 characters. The '-w' option changes it to a user-specified value. .lm -16 diagnostics .lm +8 .s.i -8;Usage ... .s Illegal options cause an extensive "help" message to be printed. .s.i -8;Bad (width | offset) ... .s An illegal or out-of-range number was given as a parameter to a '-w' or '-t' option. .s.i -8;Can't open exclude file .s.i -8;Illegal exclusion .s An exclude file text line began with a non-printing character. The line is ignored. .s.i -8;Out of space in saveexclude .s The program ran out of main memory when building the stop list. .s.i -8;No index for ... .s Kwik was invoked in '-t' mode. Unfortunately, the indicated line was not in the format "name##text". The line is ignored. .s.i -8;Cannot create temp file .s The sorts subroutine could not create its file. .s.i -8;Out of space .s The sorts subroutine filled the available disk space. .lm -8 author David Conroy, Martin Minow implementation Kwik is linked together with sorts.obj. .s The program must be linked with a large enough stack space so that the sorts routine can operate correctly. The following seems to work correctly: .s.nf tkb kwik=kwik,sorts,c:c/lb / stack = 2048 // .f bugs If an index line is too long to fit (in the kwik'ed part), the program overlays text on the line, logging the break showing where the beginning of the line was. #endif #include #define FOLD '\t' #define NBUF 128 #define FALSE 0 #define TRUE 1 #define EOS 0 extern int sort_r; /* Reverse kwik */ static int width = 80; static int sflag = 1; /* Stop list (on) */ static char *stopfile = ""; /* User's stop list */ static int tflag = 0; /* Table format flag */ static int offset = 0; /* Table format offset */ static char inbuf[NBUF]; static char outbuf[NBUF]; static char tbuf[NBUF]; /* * This table contains '1' for bytes that may begin indexed words. * This may be defined as the regular expression "[0-9A-Za-z]" */ static char ok[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0.. 15 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16.. 31 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32.. 47 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 48.. 63 0-9 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 64.. 79 A-O */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 80.. 95 P-Z */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 96..111 a-o */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 /* 112..127 p-z */ }; /* * Stop words are stored in a sorted bucket table. */ struct stoplist { struct stoplist *next; char *word; }; /* * Hack for the Decus compiler */ static struct stoplist st_and = { NULL, "and" }; static struct stoplist st_an = { &st_and, "an" }; static struct stoplist st_a = { &st_an, "a" }; #define astop st_a static struct stoplist st_by = { NULL, "by" }; #define bstop st_by static struct stoplist st_from = { NULL, "from" }; static struct stoplist st_for = { &st_from, "for" }; #define fstop st_for static struct stoplist st_in = { NULL, "in" }; #define istop st_in static struct stoplist st_on = { NULL, "on" }; static struct stoplist st_of = { &st_on, "of" }; #define ostop st_of static struct stoplist st_to = { NULL, "to" }; static struct stoplist st_the = { &st_to, "the" }; #define tstop st_the static struct stoplist st_with = { NULL, "with" }; #define wstop st_with #define NSTOP 128-' ' static struct stoplist *stoplist[NSTOP] = { /* 32.. 39 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 40.. 47 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 48.. 55 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 56.. 63 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 64.. 71 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 72.. 79 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 80.. 87 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 88.. 95 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 96..103 */ NULL, &astop, &bstop, NULL, NULL, NULL, &fstop, NULL, /* 104..111 */ NULL, &istop, NULL, NULL, NULL, NULL, NULL, &ostop, /* 112..119 */ NULL, NULL, NULL, NULL, &tstop, NULL, NULL, &wstop, /* 120..127 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; main(argc, argv) char *argv[]; { register char *cp; register c, i; FILE *fp; int nf; nf = argc-1; for (i=1; i= argc) usage(); width = getv(argv[i],2,NBUF,"width"); --nf; argv[i] = NULL; break; case 's': sflag = 0; break; case 't': if (++i >= argc) usage(); tflag++; offset = getv(argv[i],1,NBUF,"offset"); --nf; argv[i] = NULL; break; case 'x': if (++i >= argc) usage(); stopfile = argv[i]; --nf; argv[i] = NULL; break; default: usage(); } } } } getexclude(); /* Initialize stoplist */ width -= offset; /* Offset is for index */ offset -= 1; /* One byte follows the index */ if (nf <= 0) rotate(stdin); else { for (i=1; i < argc; ++i) { if ((cp = argv[i]) == NULL) continue; if ((fp = freopen(cp, "r", stdin)) == NULL) { fprintf(stderr, "kwik: %s: cannot open. Continuing\n", cp); continue; } rotate(fp); } } sorta(NULL); unrotate(); } usage() { fprintf(stderr, "-KWIK-parameter error, usage is\n"); fprintf(stderr, "\tkwik\t[-r]\t\tReverse kwik\n"); fprintf(stderr, "\t\t[-s]\t\tUse standard stoplist\n"); fprintf(stderr, "\t\t[-w width]\tOutput line width\n"); fprintf(stderr, "\t\t[-t offset]\tInput in index format\n"); fprintf(stderr, "\t\t[-x exlude]\tExclude these words from index\n"); fprintf(stderr, "\t\t[file ...]\tFiles to be processed\n"); error("?KWIK-E-Cannot proceed"); } getv(arg, min, max, who) char *arg; /* What to convert */ int min; /* Minimun acceptable value */ int max; /* Maximum acceptable value */ char *who; /* For error printout */ { register int result; result = atoi(arg); if (result < min || result > max) { error("?KWIK-E-bad %s %d, minimum = %d, maximum = %d\n", who, result, min, max); } return(result); } getexclude() /* * Store words to exclude */ { FILE *fp; register struct stoplist **stopp; /* * If -s wasn't given, erase current stoplist */ if (!sflag) { for (stopp = stoplist; stopp < &stoplist[NSTOP];) *stopp++ = NULL; } if (*stopfile == EOS) return; if ((fp = fopen(stopfile, "r")) == NULL) error("?KWIK-E-can't open exclude file \"%s\"\n", stopfile); while (fgetss(inbuf, sizeof inbuf, fp) != NULL) { saveexclude(inbuf); } fclose(fp); } saveexclude(what) char *what; /* What to save */ { register char *p; register struct stoplist *stp; struct stoplist **stopp; register int c; struct stoplist *newstop; /* * Force line to lowercase */ for (p = what; (c = *p) != EOS;) *p++ = tolower(c); stopp = &stoplist[*what - ' ']; if (stopp < &stoplist || stopp >= &stoplist[NSTOP]) error("Illegal exclusion \"%s\"\n", what); if ((newstop = malloc(sizeof (struct stoplist))) == NULL || (p = malloc(strlen(what) + 1)) == NULL) error("Out of space in saveexclude"); strcpy((newstop->word = p), what); if ((stp = *stopp) == NULL) { *stopp = newstop; newstop->next = NULL; } else { while(strcmp(stp->word, what) <= 0 && stp->next != NULL) stp = stp->next; newstop->next = stp->next; stp->next = newstop; } } testexclude(what) register char *what; /* Is it excluded? */ /* * Return true if is in the exclude table. Note: is * guaranteed to be in lowercase and what[0] is guaranteed to be * a reasonable printing character. */ { register struct stoplist *stp; register int test; test = *what - ' '; if (test < 0 || test >= NSTOP) error("Bug: illegal testexclude \"%s\"\n", what); stp = stoplist[test]; while (stp != NULL) { if ((test = strcmp(stp->word, what)) > 0) return(FALSE); else if (test == 0) return(TRUE); else stp = stp->next; } return(FALSE); } rotate(fp) FILE *fp; { register char *p; register char *tp; register char *inp; while (fgetss(inbuf, sizeof inbuf, fp) != NULL) { inp = inbuf; /* * If index mode, get the index entry */ if (tflag) { for (tp = tbuf; *inp != EOS && *inp != '\t';) *tp++ = *inp++; *tp = EOS; if (*inp == EOS) fprintf(stderr, "%KWIK-W-no index for \"%s\"\n", inbuf); else inp++; } /* * Erase junk from the rest of the line */ for (p = inp; *p != EOS; p++) { if (*p < ' ') *p = ' '; } /* * Skip to a word, output it, skip to the end of the word */ for (p = inp;;) { while (!ok[*p] && *p != EOS) p++; if (*p == EOS) break; stuff(p, inp); while (*p > ' ') p++; } } } stuff(fold_point, start) char *fold_point; /* Where to rotate from */ char *start; /* Start of the text (if indexing) */ /* * Stuff this entry (assuming it isn't excluded) */ { register char *p; register char *bp; register c; p = fold_point; bp = outbuf; /* * Get the sort argument, test against the exclusion buffer */ while ((c = *p++) > ' ') *bp++ = tolower(c); *bp = EOS; if (testexclude(outbuf)) return; *bp++ = FOLD; /* * Copy the input from the rotate point to the end of the line */ bp = cpystr(bp, fold_point); *bp++ = FOLD; /* * Copy the rest of the input (from the start to the rotate point) */ p = start; while (p < fold_point) *bp++ = *p++; /* * If indexing, append the index entry */ if (tflag) { *bp++ = FOLD; bp = cpystr(bp, tbuf); } *bp = EOS; sorta(outbuf); } unrotate() { register char *in; register char *out; register c; char *start; char *rest; char *bufend; char *middle; char *next(); long counter; /* For "first fold" debug */ bufend = &outbuf[width]; middle = &outbuf[(width - 1) / 2]; /* Fold here */ for (counter = 0; sorto(inbuf) != NULL; counter++) { for (in = inbuf; (c = *in++) != EOS && c != FOLD;) ; #if (1 == 1) /* * This is a crude work-around for a bug in * the Decus-C I/O system (I think). */ if (c == EOS && in == &inbuf[1] && counter == 0) continue; /* Hack bug in i/o system */ #endif if (c == EOS) { fprintf(stderr, "No first fold in %d byte record number %ld\n", strlen(inbuf), counter); fprintf(stderr, "%s\"\n", inbuf); error("Bug: no first fold"); } counter++; /* * Partition the text line */ start = in; /* start -> after fold */ while ((c = *in) != EOS && c != FOLD) in++; if (c == EOS) error("Bug: missing second fold"); *in++ = EOS; /* Terminate right side */ rest = in; while ((c = *in) != EOS && c != FOLD) in++; /* * Output the index */ if (tflag) { if (c == EOS) error("Bug: missing third fold"); else { *in++ = EOS; /* Terminate left side */ printf("%-?s ", offset, in); } } else { if (c != EOS) error("Bug: extra fold"); } /* * Partition the line. At this point: * start -> line after the fold * rest -> line from start to fold * Clear the line and stuff the text into it */ out = &outbuf; while (out < bufend) *out++ = ' '; /* * Copy from "start" to the right half of the output buffer * This algorithm was taken from the Lawerence Livermore * tool kit. */ out = middle; for (in = start; (c = *in) != EOS; in++) { if (in > start && in[-1] == ' ') { if (next(1, start, in, out) >= bufend) out = outbuf; } if (out >= bufend) out = outbuf; *out++ = c; } /* * Copy from the end of the text to the middle (backwards) */ out = middle; for (in = rest; *in != EOS; in++); while (--in >= rest) { out--; if (in[1] == ' ') { if (next(-1, rest, in, out) < outbuf) out = bufend - 1; } if (out < outbuf) out = bufend - 1; *out = *in; } /* * Delete trailing blanks */ for (out = bufend; *--out == ' ' && *out >= outbuf;); out[1] = EOS; printf("%s\n", outbuf); } } char * next(increment, edge, in, out) int increment; /* Which direction (+1 | -1) */ char *edge; /* Lower limit for in */ register char *in; /* From pointer */ register char *out; /* Output pointer */ { register int c; for (; in >= edge; in += increment) { if (*in == ' ' || *in == EOS) break; out += increment; } return(out); }