#
/*
 *			U N I Q . C
 *
 * Read a file, writing unique (or non-unique) lines.
 *
 */

/*)BUILD	$(TKBOPTIONS) = {
			TASK	= ...UNI
		}
*/

#ifdef	DOCUMENTATION

title	uniq	Print Unique Lines in a File
index		Print Unique Lines in a File

synopsis

	uniq [-options] [-fields] [+letters] [ input [output] ]

description

	Uniq reads a sorted input file, writing each unique line.
	The following options are defined:
	.lm +8
	.s.i -8;-u	Only print unique lines.
	.s.i -8;-d	Only print duplicate lines.
	.s.i -8;-c	Print the number of times each line occurred along with
	the line.
	.s.i -8;-z	Print the count (as in -c) with leading zeros.
	.s.i -8;-N	Skip over the first N words before checking
	for uniqueness.
	.s.i -8;+N	Skip over the first N letters (in the indicated field).
	Note that fields are skipped before letters.
	.s.i -8;N	Compare only N letters.
	.s.lm -8
	A word is defined as "optional spaces or tabs" followed by text up to
	the first space, tab, or end of line.
	.s
	If the output file is not specified, uniq will write to the standard
	output.  If the input file is not specified, uniq will read from the
	standard input.
	.s
	For an on-line help message, execute:
	.s
		uniq ?

diagnostics

	.lm +8
	.s.i -8;Can't open input file "name"
	.s.i -8;Can't open output file "name"
	.lm -8

author

	Martin Minow

bugs

	It would be nice if you could output all instances of
	duplicated lines -- especially if you skip fields.

#endif

char	*documentation[]  = {
"Uniq reads an input file, writing each unique line.",
"Usage:	uniq [-Mode] [-N_fields] [+N_letters] [infile [outfile]]",
"",
"Where:",
"",
" -u	Only print unique lines.",
" -d	Only print duplicate lines.",
" -c	Print the number of times each line occurred along with the line.",
" -z	Print the count (as in -c) with leading zero's.",
" -N	Skip over the first N words before checking for uniqueness",
" +N	Skip over the first N letters (in the indicated field)",
"  N	Compare only N letters after skipping",
"",
"A word is defined as \"optional spaces or tabs\" followed by text up to",
"the first space, tab, or end of line.",
"",
"If no file names are given, input and output are stdin and stdout",
"",
#ifdef vms
"To use on vms (native mode), define \"uniq :== $disk:[account]uniq\",",
"",
#endif
0 };

#include <stdio.h>
#include <ctype.h>
#define	EOS		0
#define	FALSE		0
#define	TRUE		1
/*
 * getline() hands BUFSIZE to fgets(), so at most BUFSIZE-1 characters
 * are read per call; a longer input line is seen as several lines.
 */
#define	BUFSIZE		1024		/* Buffer size (max. line)	*/

/*
 * Option state, set once by main() while decoding the command line.
 */
int	skip_fields = 0;		/* -N: words to skip		*/
int	skip_letters = 0;		/* +N: letters to skip		*/
int	check_letters = 0;		/* N: letters to test, 0 = all	*/
int	countmode = 0;			/* 'c', 'z', or 0 for no count	*/
int	mode = 0;			/* 'u', 'd', or 0 for all lines	*/

/*
 * Working state.  check() counts into linecount and output() prints it.
 */
int	linecount;			/* How many repetitions		*/

/*
 * The two line buffers hold text: they are filled by fgets() and are
 * compared and printed as strings, so they are character arrays.
 * main() alternates between them so that the previous line is still
 * available while the next one is being read.
 */
char	line1[BUFSIZE];			/* Input buffer 1		*/
char	line2[BUFSIZE];			/* Input buffer 2		*/

FILE	*infd;				/* Input file			*/
FILE	*outfd;				/* Output file			*/

		
int main(argc, argv)
int	argc;		/* Number of arguments				*/
char	*argv[];	/* Argument buffer pointer			*/
/*
 * Decode the option arguments, open the optional input and output
 * files, then hand the two line buffers alternately to check().
 * This function does not return: the program exits here when help is
 * requested, a file cannot be opened, or the input is empty; otherwise
 * check() exits at end of file.
 */
{
	register char	*argp;		/* Text of current argument	*/
	register char	c;		/* Its leading character	*/
	register char	*lp;		/* Compare field of last line	*/
	char		*getline();
	char		*check();

	infd = stdin;			/* Defaults if no files named	*/
	outfd = stdout;

	if (argc <= 1 || argv[1][0] == '?') {
		help();
		exit(0);
	}

	/*
	 * Options start with '-', '+' or a digit.  The three tests are
	 * grouped inside one pair of parentheses so that they are only
	 * made while an argument remains; "&&" binds tighter than "||",
	 * and without the grouping a stale value of c kept the loop
	 * running past the end of argv.
	 */
	while (argc > 1
	  && ((c = *(argp = argv[1])) == '-' || c == '+' || isdigit(c))) {
		++argp;			/* Step over the lead character	*/
		switch (c) {

		case '+':
			skip_letters = atoi(argp);
			break;

		case '-':
			c = *argp;
			if (c >= '0' && c <= '9')
				skip_fields = atoi(argp);
			else {
				/*
				 * 'c' and 'z' select counted output.  Any
				 * other letter is stored as the mode;
				 * output() acts on 'u' and 'd' only.
				 */
				c = tolower(c);
				if (c == 'c' || c == 'z')
					countmode = c;
				else	mode = c;
			}
			break;

		default:		/* Bare number: compare length	*/
			check_letters = atoi(argv[1]);
			break;
		}
		argc--;
		argv++;
	}

	/*
	 * Whatever remains is the input file name, then the output file
	 * name.  Failures are reported on stderr so that the message is
	 * not mixed into the data written to stdout.
	 */
	if (argc > 1) {
		if ((infd = fopen(argv[1], "r")) == NULL) {
			fprintf(stderr,
				"?Can't open input file \"%s\"\n", argv[1]);
			exit(1);
		}
		argc--;
		argv++;
	}
	if (argc > 1) {
		if ((outfd = fopen(argv[1], "w")) == NULL) {
			fprintf(stderr,
				"?Can't open output file \"%s\"\n", argv[1]);
			exit(1);
		}
	}

	/*
	 * Read the first line into line2.  An empty input produces no
	 * output at all.
	 */
	if ((lp = getline(line2)) == 0) {
		fclose(infd);
		fclose(outfd);
		exit(0);
	}

	/*
	 * Each call reads into one buffer while the other still holds the
	 * previous line; the buffers swap roles on every call.
	 */
	for (;;) {
		lp = check(line1, line2, lp);
		lp = check(line2, line1, lp);
	}
}

char *check(new, old, oldpos)
char	*new;		/* New line read here				*/
char	*old;		/* Old line resides here			*/
char	*oldpos;	/* Start of field in old line			*/
/*
 * Read lines into new for as long as their compare field matches the
 * one at oldpos, counting the run in linecount (the old line itself
 * counts as 1).  Then pass old to output().  Return the compare field
 * of the first line that differs, which is left in new.  At end of
 * file, close both files and exit the program with status 0.
 */
{
	register char	*newpos;	/* Compare field of new line	*/
	char		*getline();

	linecount = 1;
	while ((newpos = getline(new)) != 0 && equals(oldpos, newpos))
		linecount++;

	output(old);
	if (newpos == 0) {		/* End of file: all done	*/
		fclose(infd);
		fclose(outfd);
		exit(0);
	}
	return(newpos);
}

int equals(old, new)
char	*old;		/* Compare this field				*/
char	*new;		/* Against this field				*/
/*
 * Return 1 if the two fields match, 0 if they do not.  When
 * check_letters is non-zero only that many leading characters are
 * compared; otherwise the fields are compared in full.
 *
 * strncmp() is used on every system.  The former "#ifdef unix" call to
 * strcmpn() named a routine that current C libraries do not provide.
 */
{
	if (check_letters)
		return(strncmp(old, new, check_letters) == 0);
	return(strcmp(old, new) == 0);
}

output(line)
char	*line;		/* What to output				*/
/*
 * Write line to outfd if the current mode wants it:
 *	mode 'u'	only lines that occurred once,
 *	mode 'd'	only lines that occurred more than once,
 *	otherwise	every line.
 * With countmode 'c' or 'z' the line is preceded by linecount and a
 * tab, blank-padded ('c') or zero-padded ('z') to seven digits.  The
 * line is written as it stands; no newline is added here.
 */
{
	register int	repeated;	/* Line was seen more than once	*/

	repeated = (linecount > 1);
	if (mode == 'u' && repeated)
		return;
	if (mode == 'd' && !repeated)
		return;

	switch (countmode) {
	case 'c':
		fprintf(outfd, "%7d\t", linecount);
		break;

	case 'z':
		fprintf(outfd, "%07d\t", linecount);
		break;

	default:
		break;
	}
	fputs(line, outfd);
}

char *getline(line)
char	*line;		/* Buffer to read into				*/
/*
 * Read the next line from infd into line (at most BUFSIZE-1 characters
 * per call).  Return 0 at end of file.  Otherwise return a pointer
 * into line at the start of the field to compare: skip_fields words
 * are skipped first, then skip_letters characters.  Skipping never
 * moves past the terminating zero byte.
 *
 * NOTE(review): newer <stdio.h> headers may declare a POSIX getline()
 * with a different signature -- confirm there is no clash on the
 * target system.
 */
{
	register char	*cp;		/* Scan position in line	*/
	register int	left;		/* Fields or letters yet to skip */

	if (fgets(line, BUFSIZE, infd) == NULL)
		return(0);

	cp = line;
	for (left = skip_fields; left > 0; left--) {
		/* A word is optional blanks and tabs, then the text.	*/
		while (*cp == ' ' || *cp == '\t')
			cp++;
		while (*cp != ' ' && *cp != '\t') {
			if (*cp == 0)	/* Line ran out of words	*/
				return(cp);
			cp++;
		}
	}
	for (left = skip_letters; left > 0 && *cp != 0; left--)
		cp++;
	return(cp);
}

help()
/*
 * Print the on-line help: every entry of documentation[] in order, one
 * per line on the standard output, stopping at the zero entry that
 * ends the table.
 */
{
	register int	i;		/* Index into documentation[]	*/

	for (i = 0; documentation[i] != 0; i++)
		printf("%s\n", documentation[i]);
}