Head

GCC Code Coverage Report

Directory:	./		Exec	Total	Coverage
File:	usr.bin/spell/spellprog.c	Lines:	0	239	0.0 %
Date:	2017-11-07	Branches:	0	244	0.0 %


/*	$OpenBSD: spellprog.c,v 1.13 2017/07/28 17:16:35 nicm Exp $	*/

/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spell.h	8.1 (Berkeley) 6/6/93
 */
/*
 * Copyright (C) Caldera International Inc.  2001-2002.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code and documentation must retain the above
 *    copyright notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed or owned by Caldera
 *	International, Inc.
 * 4. Neither the name of Caldera International, Inc. nor the names of other
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
 * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
 * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/mman.h>
#include <sys/stat.h>

#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <locale.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define DLEV 2

/*
 * Forward declarations for the functions defined in this file.
 *
 * The functions taking (char *, char *, char *, int) are the suffix
 * handlers stored in suftab[]; suffix() calls them as
 * (ep, d, a, lev): a pointer into word[], the two derivation strings
 * from the table entry, and the current index into deriv[].
 */
int	 an(char *, char *, char *, int);
int	 bility(char *, char *, char *, int);
int	 es(char *, char *, char *, int);
int	 dict(char *, char *);
int	 i_to_y(char *, char *, char *, int);
int	 ily(char *, char *, char *, int);
int	 ize(char *, char *, char *, int);
int	 metry(char *, char *, char *, int);
int	 monosyl(char *, char *);
int	 ncy(char *, char *, char *, int);
int	 nop(char *, char *, char *, int);
int	 trypref(char *, char *, int);
int	 tryword(char *, char *, int);
int	 s(char *, char *, char *, int);
int	 strip(char *, char *, char *, int);
int	 suffix(char *, int);
int	 tion(char *, char *, char *, int);
int	 vowel(unsigned char);
int	 y_to_e(char *, char *, char *, int);
int	 CCe(char *, char *, char *, int);
int	 VCe(char *, char *, char *, int);
char	*lookuppref(char **, char *);
char	*skipv(char *);
char	*estrdup(const char *);
void	 ise(void);
void	 print_word(FILE *);
void	 ztos(char *);
__dead void usage(void);

/* from look.c */
/* dict() treats a return value of 1 as "found". */
int	 look(unsigned char *, unsigned char *, unsigned char *);

/*
 * Suffix table, scanned in order by suffix(); the first entry whose
 * suffix matches the end of the word is the only one acted upon, so
 * the ordering of entries is significant (e.g. "ytilb" before "yti").
 *
 *  suf	suffix spelled backwards (suffix() compares it against the
 *	word from the end towards the beginning)
 *  p1	first handler to try
 *  n1	how far to back up from the end pointer before calling p1
 *  d1	derivation string passed to p1 as `d'
 *  a1	derivation string passed to p1 as `a'
 *  p2, n2, d2, a2
 *	optional second handler, tried only when p1 fails; entries that
 *	omit it leave p2 NULL through implicit zero-initialization
 *
 * The table is terminated by an entry whose suf is NULL.
 */
struct suftab {
	char *suf;
	int (*p1)(char *, char *, char *, int);
	int n1;
	char *d1;
	char *a1;
	int (*p2)(char *, char *, char *, int);
	int n2;
	char *d2;
	char *a2;
} suftab[] = {
	{"ssen", ily, 4, "-y+iness", "+ness" },
	{"ssel", ily, 4, "-y+i+less", "+less" },
	{"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" },
	{"s'", s, 2, "", "+'s"},
	{"s", s, 1, "", "+s"},
	{"ecn", ncy, 1, "", "-t+ce"},
	{"ycn", ncy, 1, "", "-cy+t"},
	{"ytilb", nop, 0, "", ""},
	{"ytilib", bility, 5, "-le+ility", ""},
	{"elbaif", i_to_y, 4, "-y+iable", ""},
	{"elba", CCe, 4, "-e+able", "+able"},
	{"yti", CCe, 3, "-e+ity", "+ity"},
	{"ylb", y_to_e, 1, "-e+y", ""},
	{"yl", ily, 2, "-y+ily", "+ly"},
	{"laci", strip, 2, "", "+al"},
	{"latnem", strip, 2, "", "+al"},
	{"lanoi", strip, 2, "", "+al"},
	{"tnem", strip, 4, "", "+ment"},
	{"gni", CCe, 3, "-e+ing", "+ing"},
	{"reta", nop, 0, "", ""},
	{"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
	{"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
	{"citsi", strip, 2, "", "+ic"},
	{"cihparg", i_to_y, 1, "-y+ic", ""},
	{"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"},
	{"cirtem", i_to_y, 1, "-y+ic", ""},
	{"yrtem", metry, 0, "-ry+er", ""},
	{"cigol", i_to_y, 1, "-y+ic", ""},
	{"tsigol", i_to_y, 2, "-y+ist", ""},
	{"tsi", VCe, 3, "-e+ist", "+ist"},
	/* a1 was "+ist", copied from the row above; the suffix is -ism. */
	{"msi", VCe, 3, "-e+ism", "+ism"},
	{"noitacif", i_to_y, 6, "-y+ication", ""},
	{"noitazi", ize, 5, "-e+ation", ""},
	{"rota", tion, 2, "-e+or", ""},
	{"noit", tion, 3, "-e+ion", "+ion"},
	{"naino", an, 3, "", "+ian"},
	{"na", an, 1, "", "+n"},
	{"evit", tion, 3, "-e+ive", "+ive"},
	{"ezi", CCe, 3, "-e+ize", "+ize"},
	{"pihs", strip, 4, "", "+ship"},
	{"dooh", ily, 4, "-y+hood", "+hood"},
	{"ekil", strip, 4, "", "+like"},
	{ NULL }
};

/*
 * Prefixes that lookuppref() may strip from the front of a word.
 * The array is NULL-terminated and searched front to back, returning
 * on the first usable match, so a longer prefix must be listed before
 * any shorter prefix it begins with.
 */
char *preftab[] = {
	"anti",
	"bio",
	"dis",
	"electro",
	"en",
	"fore",
	"hyper",
	"intra",
	"inter",
	"iso",
	"kilo",
	"magneto",
	"meta",
	"micro",
	"milli",
	"mis",
	"mono",
	"multi",
	"non",
	"out",
	"over",
	"photo",
	"poly",
	"pre",
	"pseudo",
	"re",
	"semi",
	"stereo",
	"sub",
	"super",
	"thermo",
	"ultra",
	"under",	/* must precede un */
	"un",
	NULL
};

/*
 * One memory-mapped word list.  main() allocates an array of these,
 * one per file argument, and marks the end of the array with an
 * entry whose fd is -1; dict() walks the array until it sees that.
 */
struct wlist {
	int fd;			/* descriptor from open(), -1 in the sentinel */
	unsigned char *front;	/* start of the mmap'd file contents */
	unsigned char *back;	/* front + file size, i.e. end of the mapping */
} *wlists;

int vflag;			/* -v: also write derivations to the found file */
int xflag;			/* -x: print every candidate stem to stdout */
char word[LINE_MAX];		/* current word; edited in place while stemming */
char original[LINE_MAX];	/* unmodified copy of the word as it was read */
char *deriv[40];		/* derivation strings, indexed by level (lev) */
char affix[40];			/* derivations joined by tryword() for -v output */

/*
 * The spellprog utility accepts a newline-delimited list of words
 * on stdin.  For arguments it expects the path to a word list and
 * the path to a file in which to store found words.
 *
 * In normal usage, spell is called twice.  The first time it is
 * called with a stop list to flag commonly misspelled words.  The
 * remaining words are then passed to spell again, this time with
 * the dictionary file as the first (non-flag) argument.
 *
 * Unlike historic versions of spellprog, this one does not use
 * hashed files.  Instead it simply requires that files be sorted
 * lexicographically and uses the same algorithm as the look utility.
 *
 * Note that spellprog should be called via the spell shell script
 * and is not meant to be invoked directly by the user.
 */

/*
 * Parse the options, mmap every word list named on the command line,
 * then read newline-terminated words from stdin until EOF.
 *
 * A word for which trypref() or suffix() succeeds is written to the
 * -o file (if one was given); any other word is written to stdout.
 * A final line that is not terminated by a newline is discarded.
 *
 * A line that does not fit in word[] is truncated to
 * sizeof(word) - 1 characters, a warning is issued, the remainder of
 * the line is skipped and the truncated word is then processed.
 *
 * Exits 0 at EOF, or 1 (via err/errc/usage) on a usage error, a word
 * list that cannot be opened or mapped, or a failure to open or flush
 * the found-words file.
 */
int
main(int argc, char **argv)
{
	char *ep, *cp, *dp;
	char *outfile;
	int ch, fold, i;
	struct stat sb;
	FILE *file, *found;

	setlocale(LC_ALL, "");

	if (pledge("stdio rpath wpath cpath flock", NULL) == -1)
		err(1, "pledge");

	outfile = NULL;
	while ((ch = getopt(argc, argv, "bvxo:")) != -1) {
		switch (ch) {
		case 'b':
			/* Use British dictionary and convert ize -> ise. */
			ise();
			break;
		case 'o':
			outfile = optarg;
			break;
		case 'v':
			/* Also write derivations to "found" file. */
			vflag = 1;
			break;
		case 'x':
			/* Print plausible stems to stdout. */
			xflag = 1;
			break;
		default:
			usage();
		}

	}
	argc -= optind;
	argv += optind;
	if (argc < 1)
		usage();

	/*
	 * Open and mmap the word/stop lists.  One extra element is
	 * allocated for the fd == -1 sentinel that ends the array.
	 */
	if ((wlists = calloc(argc + 1, sizeof(struct wlist))) == NULL)
		err(1, "malloc");
	for (i = 0; argc--; i++) {
		wlists[i].fd = open(argv[i], O_RDONLY, 0);
		if (wlists[i].fd == -1 || fstat(wlists[i].fd, &sb) != 0)
			err(1, "%s", argv[i]);
		if (sb.st_size > SIZE_MAX)
			errc(1, EFBIG, "%s", argv[i]);
		wlists[i].front = mmap(NULL, (size_t)sb.st_size, PROT_READ,
		    MAP_PRIVATE, wlists[i].fd, (off_t)0);
		if (wlists[i].front == MAP_FAILED)
			err(1, "%s", argv[i]);
		wlists[i].back = wlists[i].front + sb.st_size;
	}
	wlists[i].fd = -1;

	/* Open file where found words are to be saved. */
	if (outfile == NULL)
		found = NULL;
	else if ((found = fopen(outfile, "w")) == NULL)
		err(1, "cannot open %s", outfile);

	/* print_word() runs after every word, including after `continue'. */
	for (;; print_word(file)) {
		affix[0] = '\0';
		file = found;
		for (ep = word; (*ep = ch = getchar()) != '\n'; ep++) {
			if (ch == EOF)
				goto done;
			if ((size_t)(ep - word) == sizeof(word) - 1) {
				*ep = '\0';
				warnx("word too long (%s)", word);
				/* Slurp until EOL; EOF must end it too. */
				while ((ch = getchar()) != '\n' && ch != EOF)
					;
				if (ch == EOF)
					goto done;
				/*
				 * Stop here: advancing ep any further
				 * would store past the end of word[].
				 */
				break;
			}
		}
		for (cp = word, dp = original; cp < ep; )
			*dp++ = *cp++;
		*dp = '\0';
		fold = 0;
		for (cp = word; cp < ep; cp++)
			if (islower((unsigned char)*cp))
				goto lcase;
		/* No lower-case letter at all: try the word verbatim first. */
		if (trypref(ep, ".", 0))
			continue;
		++fold;
		/* Then fold everything but the first character. */
		for (cp = original + 1, dp = word + 1; dp < ep; dp++, cp++)
			*dp = tolower((unsigned char)*cp);
lcase:
		if (trypref(ep, ".", 0) || suffix(ep, 0))
			continue;
		if (isupper((unsigned char)word[0])) {
			/*
			 * The handlers may have edited word[]; restore it
			 * from original and retry with a lower-case initial.
			 */
			for (cp = original, dp = word; (*dp = *cp++); dp++) {
				if (fold)
					*dp = tolower((unsigned char)*dp);
			}
			word[0] = tolower((unsigned char)word[0]);
			goto lcase;
		}
		file = stdout;
	}

done:
	/* fclose() flushes; report a failed write of the found file. */
	if (found != NULL && fclose(found) == EOF)
		err(1, "%s", outfile);
	exit(0);
}

/*
 * Write the original spelling of the current word to f, one word per
 * line.  Nothing is written when f is NULL.  With -v, and when a
 * derivation was recorded in affix that does not start with '.', the
 * derivation is printed first, separated from the word by a tab.
 */
void
print_word(FILE *f)
{
	int show_affix;

	if (f == NULL)
		return;

	show_affix = vflag && affix[0] != '\0' && affix[0] != '.';
	if (!show_affix) {
		fprintf(f, "%s\n", original);
		return;
	}
	fprintf(f, "%s\t%s\n", affix, original);
}

/*
 * Find the first entry in suftab whose (reversed) suffix matches the
 * characters immediately before ep, and call the function(s)
 * associated with that suffix (p1, then p2 if p1 fails and p2 is set).
 *
 * ep	points just past the end of the portion of word[] to examine
 * lev	current derivation level; DLEV is added to it here
 *
 * Returns 1 if a handler accepted the word, otherwise 0.  0 is also
 * returned when no vowel precedes the matched suffix, and when going
 * one level deeper could index past the end of deriv[].
 */
int
suffix(char *ep, int lev)
{
	struct suftab *t;
	char *cp, *sp;

	lev += DLEV;
	/*
	 * The handlers are called with lev+1, and trypref()/tryword()
	 * store as far as two slots beyond the level they are given,
	 * so deriv[lev+3] must exist.  Without this check a word made
	 * of many stacked suffixes walks off the end of deriv[].
	 */
	if (lev + 3 >= (int)(sizeof(deriv) / sizeof(deriv[0])))
		return (0);
	deriv[lev] = deriv[lev-1] = 0;
	for (t = suftab; (sp = t->suf); t++) {
		cp = ep;
		while (*sp) {
			/* A suffix longer than the word cannot match. */
			if (cp == word || *--cp != *sp++)
				goto next;
		}
		/* The stem left of the suffix must contain a vowel. */
		for (sp = cp; sp > word && !vowel(sp[-1]); sp--)
			;	/* nothing */
		if (sp == word)
			return (0);
		if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1))
			return (1);
		if (t->p2 != NULL) {
			deriv[lev] = deriv[lev+1] = 0;
			return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev));
		}
		return (0);
next:		;
	}
	return (0);
}

/*
 * Suffix handler that rejects unconditionally.  All four arguments
 * are ignored; the return value is always 0.
 */
int
nop(char *ep, char *d, char *a, int lev)
{
	(void)ep;
	(void)d;
	(void)a;
	(void)lev;

	return (0);
}

/*
 * Try the word ending at ep as it stands (with prefix stripping, via
 * trypref() using derivation string a), and if that fails look for a
 * further suffix with suffix().  d is not used here.
 * Returns 1 on success, 0 otherwise.
 */
int
strip(char *ep, char *d, char *a, int lev)
{

	if (trypref(ep, a, lev))
		return (1);
	if (suffix(ep, lev))
		return (1);
	return (0);
}

/*
 * Handler for a trailing "s".  Rejects when lev is above DLEV + 1 and
 * when the character at ep and the one before it are both 's';
 * otherwise defers to strip().
 */
int
s(char *ep, char *d, char *a, int lev)
{

	if (lev > DLEV + 1 || (ep[0] == 's' && ep[-1] == 's'))
		return (0);
	return (strip(ep, d, a, lev));
}

/*
 * Handler for the "an"/"onian" suffixes.  Only applies when the word
 * starts with an upper-case letter (it must be a proper name); in
 * that case the result of trypref() is returned, otherwise 0.
 */
int
an(char *ep, char *d, char *a, int lev)
{

	if (isupper((unsigned char)word[0]))
		return (trypref(ep, a, lev));
	return (0);
}

/*
 * Handler for "ization": overwrite the character at ep with 'e' and
 * hand the word ending just after it to strip(), passing d as the
 * derivation string.  The overwritten character is not restored.
 */
int
ize(char *ep, char *d, char *a, int lev)
{

	ep[0] = 'e';
	return (strip(ep + 1 ,"", d, lev));
}

/*
 * Temporarily put an 'e' at ep and try the word ending just after it
 * with strip(), using d as the derivation string.  On failure the
 * character that was at ep is put back.
 * Returns 1 on success, 0 otherwise.
 */
int
y_to_e(char *ep, char *d, char *a, int lev)
{
	char saved;

	saved = ep[0];
	ep[0] = 'e';
	if (strip(ep + 1, "", d, lev))
		return (1);
	/* No luck; undo the edit. */
	ep[0] = saved;
	return (0);
}

/*
 * Dispatch on the character before ep: an 'i' is handled by i_to_y(),
 * anything else goes straight to strip().
 */
int
ily(char *ep, char *d, char *a, int lev)
{

	return (ep[-1] == 'i' ? i_to_y(ep, d, a, lev) :
	    strip(ep, d, a, lev));
}

/*
 * Handler for "nce"/"ncy".  Rejects when two applications of skipv(),
 * starting from the character before ep, run off the front of word.
 * Otherwise the character before ep is replaced by 't' and the result
 * is passed to strip().
 */
int
ncy(char *ep, char *d, char *a, int lev)
{
	char *stem;

	stem = skipv(skipv(ep - 1));
	if (stem < word)
		return (0);
	ep[-1] = 't';
	return (strip(ep, d, a, lev));
}

/*
 * Handler for "bility": store an 'l' at ep, then let y_to_e() deal
 * with the position that follows it.
 */
int
bility(char *ep, char *d, char *a, int lev)
{

	ep[0] = 'l';
	return (y_to_e(ep + 1, d, a, lev));
}

/*
 * If the character before ep is 'i', change it to 'y' and use d as the
 * derivation string; otherwise leave the word alone and use a.  Either
 * way the word is then passed to strip().
 */
int
i_to_y(char *ep, char *d, char *a, int lev)
{
	char *how;

	how = a;
	if (ep[-1] == 'i') {
		ep[-1] = 'y';
		how = d;
	}
	return (strip(ep, "", how, lev));
}

/*
 * Handler for "es".  Only acts when lev is no greater than DLEV.
 * A preceding 'i' is handled by i_to_y(); a preceding 's', 'h', 'z'
 * or 'x' by strip(); any other preceding character is rejected.
 */
int
es(char *ep, char *d, char *a, int lev)
{
	char prev;

	if (lev > DLEV)
		return (0);

	prev = ep[-1];
	if (prev == 'i')
		return (i_to_y(ep, d, a, lev));
	if (prev == 's' || prev == 'h' || prev == 'z' || prev == 'x')
		return (strip(ep, d, a, lev));
	return (0);
}

/*
 * Handler for "metry": overwrite the two characters before ep with
 * "er" and pass the result to strip().
 */
int
metry(char *ep, char *d, char *a, int lev)
{
	char *tail;

	tail = ep - 2;
	tail[0] = 'e';
	tail[1] = 'r';
	return (strip(ep, d, a, lev));
}

/*
 * Handler for "tion", "tive" and "ator".  The character two places
 * before ep decides what happens: 'c' or 'r' tries the word as is
 * with trypref(), 'a' goes through y_to_e(), anything else fails.
 */
int
tion(char *ep, char *d, char *a, int lev)
{
	char key;

	key = ep[-2];
	if (key == 'c' || key == 'r')
		return (trypref(ep, a, lev));
	if (key == 'a')
		return (y_to_e(ep, d, a, lev));
	return (0);
}

/*
 * Possible consonant-consonant-e ending.
 *
 * Depending on the one or two characters before ep, either try the
 * stem with an 'e' appended (y_to_e), give up, or fall back to VCe().
 * Several cases deliberately fall through into the next one.
 * Returns 1 if the word was accepted, 0 otherwise.
 */
int
CCe(char *ep, char *d, char *a, int lev)
{

	switch (ep[-1]) {
	case 'l':
		if (vowel(ep[-2]))
			break;
		switch (ep[-2]) {
		case 'l':
		case 'r':
		case 'w':
			break;
		default:
			return (y_to_e(ep, d, a, lev));
		}
		break;
	case 's':
		if (ep[-2] == 's')
			break;
		/* FALLTHROUGH */
	case 'c':
	case 'g':
		if (*ep == 'a')
			return (0);
		/* FALLTHROUGH */
	case 'v':
	case 'z':
		if (vowel(ep[-2]))
			break;
		/* FALLTHROUGH */
	case 'u':
		if (y_to_e(ep, d, a, lev))
			return (1);
		/*
		 * Only an "ng" stem goes on to VCe().  ep[-1] is tested
		 * first: when this is reached through case 'u' the stem
		 * may be a single letter, and ep[-2] would then lie in
		 * front of word[].
		 */
		if (!(ep[-1] == 'g' && ep[-2] == 'n'))
			return (0);
	}
	return (VCe(ep, d, a, lev));
}

/*
 * Possible consonant-vowel-consonant-e ending.
 *
 * A stem already ending in 'e' is rejected.  When the stem ends in a
 * vowel followed by a non-vowel, it is first tried with an 'e'
 * appended (derivation string d); if that fails the overwritten
 * character is restored.  Finally the unmodified stem goes to strip().
 */
int
VCe(char *ep, char *d, char *a, int lev)
{
	char saved;

	if (ep[-1] == 'e')
		return (0);
	if (!vowel(ep[-1]) && vowel(ep[-2])) {
		saved = ep[0];
		ep[0] = 'e';
		if (trypref(ep + 1, d, lev) || suffix(ep + 1, lev))
			return (1);
		ep[0] = saved;
	}
	return (strip(ep, d, a, lev));
}

/*
 * Look for an entry of preftab that the text at *wp starts with
 * (the word is compared case-insensitively) and that is followed by
 * at least one vowel before ep.
 *
 * On success *wp is advanced past the prefix and the matching preftab
 * string is returned.  If no entry qualifies, *wp is left alone and
 * NULL is returned.
 */
char *
lookuppref(char **wp, char *ep)
{
	char **entry;
	char *rest, *pfx, *scan;
	int matched;

	for (entry = preftab; *entry != NULL; entry++) {
		rest = *wp;
		matched = 1;
		for (pfx = *entry; *pfx != '\0'; pfx++, rest++) {
			if (tolower((unsigned char)*rest) != *pfx) {
				matched = 0;
				break;
			}
		}
		if (!matched)
			continue;
		/* What remains after the prefix needs a vowel. */
		for (scan = rest; scan < ep; scan++) {
			if (vowel(*scan)) {
				*wp = rest;
				return (*entry);
			}
		}
	}
	return (NULL);
}

/*
 * If the word is not in the dictionary, try stripping off prefixes
 * until the word is found or we run out of prefixes to check.
 *
 * ep	points just past the end of the candidate in word[]
 * a	derivation string recorded in deriv[lev] for this attempt
 * lev	derivation level; deriv[lev+1] and deriv[lev+2] are also used
 *
 * The stripped prefixes are collected, each preceded by '+', in a
 * small local buffer that deriv[lev+1] points at while the lookups
 * run.  The search stops when another prefix would not fit in it.
 * Returns 1 if some form of the word was found, 0 otherwise.
 */
int
trypref(char *ep, char *a, int lev)
{
	char *cp;
	char *bp;
	char *pp;
	int val = 0;
	char space[20];

	deriv[lev] = a;
	if (tryword(word, ep, lev))
		return (1);
	bp = word;
	pp = space;
	space[0] = '\0';
	deriv[lev+1] = pp;
	while ((cp = lookuppref(&bp, ep))) {
		/*
		 * Room is needed for '+', the prefix and the NUL.
		 * Check before copying, not after, so that stacked
		 * prefixes cannot overrun space[].
		 */
		if (strlen(cp) + 2 > sizeof(space) - (size_t)(pp - space))
			break;
		*pp++ = '+';
		while ((*pp = *cp++))
			pp++;
		if (tryword(bp, ep, lev+1)) {
			val = 1;
			break;
		}
	}
	/* Never leave deriv[] pointing at this function's stack. */
	deriv[lev+1] = deriv[lev+2] = 0;
	return (val);
}

/*
 * Look up the text [bp, ep) in the word lists.
 *
 * Candidates of one character or less are rejected, as are
 * candidates for which monosyl() is true while the character at ep
 * is a vowel.  If the lookup fails, the character at ep is a vowel,
 * the candidate ends in a doubled letter and monosyl() is true for
 * the candidate without its last letter, the lookup is retried
 * without that last letter and "+<letter>" is recorded one level up.
 *
 * On a hit with -v in effect, the non-NULL deriv[] entries from lev
 * down to 1 are appended to affix.
 * Returns the result of the last dict() call, or 0 if rejected.
 */
int
tryword(char *bp, char *ep, int lev)
{
	int hit, j;
	char duple[3];

	if (ep - bp <= 1)
		return (0);
	if (vowel(*ep) && monosyl(bp, ep))
		return (0);

	hit = dict(bp, ep);
	if (hit == 0 && vowel(*ep) && ep[-1] == ep[-2] &&
	    monosyl(bp, ep - 1)) {
		ep--;
		lev++;
		duple[0] = '+';
		duple[1] = *ep;
		duple[2] = '\0';
		deriv[lev] = duple;
		hit = dict(bp, ep);
	}

	if (hit != 0 && vflag != 0) {
		/* Also tack on possible derivations. (XXX - warn on truncation?) */
		for (j = lev; j > 0; j--) {
			if (deriv[j] != NULL)
				strlcat(affix, deriv[j], sizeof(affix));
		}
	}
	return (hit);
}

/*
 * Return 1 when [bp, ep) is at least two characters long, ends in a
 * non-vowel other than 'x' or 'w', has a vowel immediately before
 * that, and contains no other vowel.  Return 0 otherwise.
 */
int
monosyl(char *bp, char *ep)
{
	char *last, *p;

	if (ep < bp + 2)
		return (0);

	last = ep - 1;
	if (vowel(*last))
		return (0);
	if (!vowel(last[-1]))
		return (0);
	if (*last == 'x' || *last == 'w')
		return (0);

	/* Everything in front of that vowel must be a non-vowel. */
	for (p = last - 1; p > bp; p--) {
		if (vowel(p[-1]))
			return (0);
	}
	return (1);
}

/*
 * Starting at s and moving towards the start of word: step over one
 * vowel if s is on one, then step over any run of non-vowels.
 * Returns the resulting position, which is below word if the start
 * of the buffer was passed.
 */
char *
skipv(char *s)
{

	if (s >= word && vowel(*s))
		s--;
	for (; s >= word; s--) {
		if (vowel(*s))
			break;
	}
	return (s);
}

/*
 * Return 1 if c, after conversion with tolower(), is one of
 * a, e, i, o, u or y; return 0 for every other character.
 */
int
vowel(unsigned char c)
{
	int lc;

	lc = tolower(c);
	if (lc == 'a' || lc == 'e' || lc == 'i' ||
	    lc == 'o' || lc == 'u' || lc == 'y')
		return (1);
	return (0);
}

/*
 * Crummy way to Britishise.
 *
 * Every suftab entry whose suffix contains a 'z' gets its suffix, and
 * its d1/a1 strings if they contain a 'z' too, replaced by heap copies
 * in which each 'z' has been turned into 's'.  The copies are never
 * freed.  d2 and a2 are not examined.
 */
void
ise(void)
{
	struct suftab *tab;

	for (tab = suftab; tab->suf != NULL; tab++) {
		/* Assume that suffix will contain 'z' if a1 or d1 do */
		if (strchr(tab->suf, 'z') == NULL)
			continue;

		tab->suf = estrdup(tab->suf);
		ztos(tab->suf);

		if (strchr(tab->d1, 'z') != NULL) {
			tab->d1 = estrdup(tab->d1);
			ztos(tab->d1);
		}
		if (strchr(tab->a1, 'z') != NULL) {
			tab->a1 = estrdup(tab->a1);
			ztos(tab->a1);
		}
	}
}

/*
 * Replace every lower-case 'z' in the NUL-terminated string s with
 * 's', in place.  No other character is changed.
 */
void
ztos(char *s)
{
	char *p;

	p = s;
	while (*p != '\0') {
		if (*p == 'z')
			*p = 's';
		p++;
	}
}

/*
 * strdup() that does not return on failure: if the copy cannot be
 * allocated the program exits through err(1, ...).  The caller owns
 * the returned string.
 */
char *
estrdup(const char *s)
{
	char *copy;

	copy = strdup(s);
	if (copy == NULL)
		err(1, "strdup");
	return (copy);
}

/*
 * Look up a word in the dictionary.
 * Returns 1 if found, 0 if not.
 */
int
dict(char *bp, char *ep)
{
	char c;
	int i, rval;

	c = *ep;
	*ep = '\0';
	if (xflag)
		printf("=%s\n", bp);
	for (i = rval = 0; wlists[i].fd != -1; i++) {
		if ((rval = look((unsigned char *)bp, wlists[i].front,
		    wlists[i].back)) == 1)
			break;
	}
	*ep = c;
	return (rval);
}

/*
 * Print a one-line usage summary, prefixed with the program name, to
 * stderr and exit with status 1.  Does not return.
 */
__dead void
usage(void)
{
	extern char *__progname;
	const char *me;

	me = __progname;
	fprintf(stderr, "usage: %s [-bvx] [-o found-words] word-list ...\n",
	    me);
	exit(1);
}


Generated by: GCOVR (Version 3.3)

Line	Branch	Exec	Source
1			/* $OpenBSD: spellprog.c,v 1.13 2017/07/28 17:16:35 nicm Exp $ */
2
3			/*
4			* Copyright (c) 1991, 1993
5			* The Regents of the University of California. All rights reserved.
6			*
7			* Redistribution and use in source and binary forms, with or without
8			* modification, are permitted provided that the following conditions
9			* are met:
10			* 1. Redistributions of source code must retain the above copyright
11			* notice, this list of conditions and the following disclaimer.
12			* 2. Redistributions in binary form must reproduce the above copyright
13			* notice, this list of conditions and the following disclaimer in the
14			* documentation and/or other materials provided with the distribution.
15			* 3. Neither the name of the University nor the names of its contributors
16			* may be used to endorse or promote products derived from this software
17			* without specific prior written permission.
18			*
19			* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20			* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21			* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22			* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23			* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24			* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25			* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26			* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27			* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28			* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29			* SUCH DAMAGE.
30			*
31			* @(#)spell.h 8.1 (Berkeley) 6/6/93
32			*/
33			/*
34			* Copyright (C) Caldera International Inc. 2001-2002.
35			* All rights reserved.
36			*
37			* Redistribution and use in source and binary forms, with or without
38			* modification, are permitted provided that the following conditions
39			* are met:
40			* 1. Redistributions of source code and documentation must retain the above
41			* copyright notice, this list of conditions and the following disclaimer.
42			* 2. Redistributions in binary form must reproduce the above copyright
43			* notice, this list of conditions and the following disclaimer in the
44			* documentation and/or other materials provided with the distribution.
45			* 3. All advertising materials mentioning features or use of this software
46			* must display the following acknowledgement:
47			* This product includes software developed or owned by Caldera
48			* International, Inc.
49			* 4. Neither the name of Caldera International, Inc. nor the names of other
50			* contributors may be used to endorse or promote products derived from
51			* this software without specific prior written permission.
52			*
53			* USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
54			* INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
55			* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
56			* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
57			* IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
58			* INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
59			* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
60			* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61			* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
62			* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
63			* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
64			* POSSIBILITY OF SUCH DAMAGE.
65			*/
66
67			#include <sys/mman.h>
68			#include <sys/stat.h>
69
70			#include <ctype.h>
71			#include <err.h>
72			#include <errno.h>
73			#include <fcntl.h>
74			#include <limits.h>
75			#include <locale.h>
76			#include <stdint.h>
77			#include <stdio.h>
78			#include <stdlib.h>
79			#include <string.h>
80			#include <unistd.h>
81
82			#define DLEV 2
83
84			int an(char , char , char *, int);
85			int bility(char , char , char *, int);
86			int es(char , char , char *, int);
87			int dict(char , char );
88			int i_to_y(char , char , char *, int);
89			int ily(char , char , char *, int);
90			int ize(char , char , char *, int);
91			int metry(char , char , char *, int);
92			int monosyl(char , char );
93			int ncy(char , char , char *, int);
94			int nop(char , char , char *, int);
95			int trypref(char , char , int);
96			int tryword(char , char , int);
97			int s(char , char , char *, int);
98			int strip(char , char , char *, int);
99			int suffix(char *, int);
100			int tion(char , char , char *, int);
101			int vowel(unsigned char);
102			int y_to_e(char , char , char *, int);
103			int CCe(char , char , char *, int);
104			int VCe(char , char , char *, int);
105			char lookuppref(char , char );
106			char skipv(char );
107			char estrdup(const char );
108			void ise(void);
109			void print_word(FILE *);
110			void ztos(char *);
111			__dead void usage(void);
112
113			/* from look.c */
114			int look(unsigned char , unsigned char , unsigned char *);
115
116			struct suftab {
117			char *suf;
118			int (p1)(char , char , char , int);
119			int n1;
120			char *d1;
121			char *a1;
122			int (p2)(char , char , char , int);
123			int n2;
124			char *d2;
125			char *a2;
126			} suftab[] = {
127			{"ssen", ily, 4, "-y+iness", "+ness" },
128			{"ssel", ily, 4, "-y+i+less", "+less" },
129			{"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" },
130			{"s'", s, 2, "", "+'s"},
131			{"s", s, 1, "", "+s"},
132			{"ecn", ncy, 1, "", "-t+ce"},
133			{"ycn", ncy, 1, "", "-cy+t"},
134			{"ytilb", nop, 0, "", ""},
135			{"ytilib", bility, 5, "-le+ility", ""},
136			{"elbaif", i_to_y, 4, "-y+iable", ""},
137			{"elba", CCe, 4, "-e+able", "+able"},
138			{"yti", CCe, 3, "-e+ity", "+ity"},
139			{"ylb", y_to_e, 1, "-e+y", ""},
140			{"yl", ily, 2, "-y+ily", "+ly"},
141			{"laci", strip, 2, "", "+al"},
142			{"latnem", strip, 2, "", "+al"},
143			{"lanoi", strip, 2, "", "+al"},
144			{"tnem", strip, 4, "", "+ment"},
145			{"gni", CCe, 3, "-e+ing", "+ing"},
146			{"reta", nop, 0, "", ""},
147			{"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
148			{"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
149			{"citsi", strip, 2, "", "+ic"},
150			{"cihparg", i_to_y, 1, "-y+ic", ""},
151			{"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"},
152			{"cirtem", i_to_y, 1, "-y+ic", ""},
153			{"yrtem", metry, 0, "-ry+er", ""},
154			{"cigol", i_to_y, 1, "-y+ic", ""},
155			{"tsigol", i_to_y, 2, "-y+ist", ""},
156			{"tsi", VCe, 3, "-e+ist", "+ist"},
157			{"msi", VCe, 3, "-e+ism", "+ist"},
158			{"noitacif", i_to_y, 6, "-y+ication", ""},
159			{"noitazi", ize, 5, "-e+ation", ""},
160			{"rota", tion, 2, "-e+or", ""},
161			{"noit", tion, 3, "-e+ion", "+ion"},
162			{"naino", an, 3, "", "+ian"},
163			{"na", an, 1, "", "+n"},
164			{"evit", tion, 3, "-e+ive", "+ive"},
165			{"ezi", CCe, 3, "-e+ize", "+ize"},
166			{"pihs", strip, 4, "", "+ship"},
167			{"dooh", ily, 4, "-y+hood", "+hood"},
168			{"ekil", strip, 4, "", "+like"},
169			{ NULL }
170			};
171
172			char *preftab[] = {
173			"anti",
174			"bio",
175			"dis",
176			"electro",
177			"en",
178			"fore",
179			"hyper",
180			"intra",
181			"inter",
182			"iso",
183			"kilo",
184			"magneto",
185			"meta",
186			"micro",
187			"milli",
188			"mis",
189			"mono",
190			"multi",
191			"non",
192			"out",
193			"over",
194			"photo",
195			"poly",
196			"pre",
197			"pseudo",
198			"re",
199			"semi",
200			"stereo",
201			"sub",
202			"super",
203			"thermo",
204			"ultra",
205			"under", /* must precede un */
206			"un",
207			NULL
208			};
209
210			struct wlist {
211			int fd;
212			unsigned char *front;
213			unsigned char *back;
214			} *wlists;
215
216			int vflag;
217			int xflag;
218			char word[LINE_MAX];
219			char original[LINE_MAX];
220			char *deriv[40];
221			char affix[40];
222
223			/*
224			* The spellprog utility accepts a newline-delimited list of words
225			* on stdin. For arguments it expects the path to a word list and
226			* the path to a file in which to store found words.
227			*
228			* In normal usage, spell is called twice. The first time it is
229			* called with a stop list to flag commonly misspelled words. The
230			* remaining words are then passed to spell again, this time with
231			* the dictionary file as the first (non-flag) argument.
232			*
233			* Unlike historic versions of spellprog, this one does not use
234			* hashed files. Instead it simply requires that files be sorted
235			* lexicographically and uses the same algorithm as the look utility.
236			*
237			* Note that spellprog should be called via the spell shell script
238			* and is not meant to be invoked directly by the user.
239			*/
240
241			int
242			main(int argc, char **argv)
243			{
244			char ep, cp, *dp;
245			char *outfile;
246			int ch, fold, i;
247			struct stat sb;
248			FILE file, found;
249
250			setlocale(LC_ALL, "");
251
252			if (pledge("stdio rpath wpath cpath flock", NULL) == -1)
253			err(1, "pledge");
254
255			outfile = NULL;
256			while ((ch = getopt(argc, argv, "bvxo:")) != -1) {
257			switch (ch) {
258			case 'b':
259			/* Use British dictionary and convert ize -> ise. */
260			ise();
261			break;
262			case 'o':
263			outfile = optarg;
264			break;
265			case 'v':
266			/* Also write derivations to "found" file. */
267			vflag = 1;
268			break;
269			case 'x':
270			/* Print plausible stems to stdout. */
271			xflag = 1;
272			break;
273			default:
274			usage();
275			}
276
277			}
278			argc -= optind;
279			argv += optind;
280			if (argc < 1)
281			usage();
282
283			/* Open and mmap the word/stop lists. */
284			if ((wlists = calloc(sizeof(struct wlist), (argc + 1))) == NULL)
285			err(1, "malloc");
286			for (i = 0; argc--; i++) {
287			wlists[i].fd = open(argv[i], O_RDONLY, 0);
288			if (wlists[i].fd == -1 \|\| fstat(wlists[i].fd, &sb) != 0)
289			err(1, "%s", argv[i]);
290			if (sb.st_size > SIZE_MAX)
291			errc(1, EFBIG, "%s", argv[i]);
292			wlists[i].front = mmap(NULL, (size_t)sb.st_size, PROT_READ,
293			MAP_PRIVATE, wlists[i].fd, (off_t)0);
294			if (wlists[i].front == MAP_FAILED)
295			err(1, "%s", argv[i]);
296			wlists[i].back = wlists[i].front + sb.st_size;
297			}
298			wlists[i].fd = -1;
299
300			/* Open file where found words are to be saved. */
301			if (outfile == NULL)
302			found = NULL;
303			else if ((found = fopen(outfile, "w")) == NULL)
304			err(1, "cannot open %s", outfile);
305
306			for (;; print_word(file)) {
307			affix[0] = '\0';
308			file = found;
309			for (ep = word; (*ep = ch = getchar()) != '\n'; ep++) {
310			if (ep - word == sizeof(word) - 1) {
311			*ep = '\0';
312			warnx("word too long (%s)", word);
313			while ((ch = getchar()) != '\n')
314			; /* slurp until EOL */
315			}
316			if (ch == EOF) {
317			if (found != NULL)
318			fclose(found);
319			exit(0);
320			}
321			}
322			for (cp = word, dp = original; cp < ep; )
323			dp++ = cp++;
324			*dp = '\0';
325			fold = 0;
326			for (cp = word; cp < ep; cp++)
327			if (islower((unsigned char)*cp))
328			goto lcase;
329			if (trypref(ep, ".", 0))
330			continue;
331			++fold;
332			for (cp = original + 1, dp = word + 1; dp < ep; dp++, cp++)
333			dp = tolower((unsigned char)cp);
334			lcase:
335			if (trypref(ep, ".", 0) \|\| suffix(ep, 0))
336			continue;
337			if (isupper((unsigned char)word[0])) {
338			for (cp = original, dp = word; (dp = cp++); dp++) {
339			if (fold)
340			dp = tolower((unsigned char)dp);
341			}
342			word[0] = tolower((unsigned char)word[0]);
343			goto lcase;
344			}
345			file = stdout;
346			}
347
348			exit(0);
349			}
350
351			void
352			print_word(FILE *f)
353			{
354
355			if (f != NULL) {
356			if (vflag && affix[0] != '\0' && affix[0] != '.')
357			fprintf(f, "%s\t%s\n", affix, original);
358			else
359			fprintf(f, "%s\n", original);
360			}
361			}
362
363			/*
364			* For each matching suffix in suftab, call the function associated
365			* with that suffix (p1 and p2).
366			*/
367			int
368			suffix(char *ep, int lev)
369			{
370			struct suftab *t;
371			char cp, sp;
372
373			lev += DLEV;
374			deriv[lev] = deriv[lev-1] = 0;
375			for (t = suftab; (sp = t->suf); t++) {
376			cp = ep;
377			while (*sp) {
378			if (--cp != sp++)
379			goto next;
380			}
381			for (sp = cp; --sp >= word && !vowel(*sp);)
382			; /* nothing */
383			if (sp < word)
384			return (0);
385			if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1))
386			return (1);
387			if (t->p2 != NULL) {
388			deriv[lev] = deriv[lev+1] = 0;
389			return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev));
390			}
391			return (0);
392			next: ;
393			}
394			return (0);
395			}
396
397			int
398			nop(char ep, char d, char *a, int lev)
399			{
400
401			return (0);
402			}
403
404			int
405			strip(char ep, char d, char *a, int lev)
406			{
407
408			return (trypref(ep, a, lev) \|\| suffix(ep, lev));
409			}
410
411			int
412			s(char ep, char d, char *a, int lev)
413			{
414
415			if (lev > DLEV + 1)
416			return (0);
417			if (*ep == 's' && ep[-1] == 's')
418			return (0);
419			return (strip(ep, d, a, lev));
420			}
421
422			int
423			an(char ep, char d, char *a, int lev)
424			{
425
426			if (!isupper((unsigned char)word)) / must be proper name */
427			return (0);
428			return (trypref(ep,a,lev));
429			}
430
431			int
432			ize(char ep, char d, char *a, int lev)
433			{
434
435			*ep++ = 'e';
436			return (strip(ep ,"", d, lev));
437			}
438
439			int
440			y_to_e(char ep, char d, char *a, int lev)
441			{
442			char c = *ep;
443
444			*ep++ = 'e';
445			if (strip(ep, "", d, lev))
446			return (1);
447			ep[-1] = c;
448			return (0);
449			}
450
451			int
452			ily(char ep, char d, char *a, int lev)
453			{
454
455			if (ep[-1] == 'i')
456			return (i_to_y(ep, d, a, lev));
457			else
458			return (strip(ep, d, a, lev));
459			}
460
461			int
462			ncy(char ep, char d, char *a, int lev)
463			{
464
465			if (skipv(skipv(ep-1)) < word)
466			return (0);
467			ep[-1] = 't';
468			return (strip(ep, d, a, lev));
469			}
470
471			int
472			bility(char ep, char d, char *a, int lev)
473			{
474
475			*ep++ = 'l';
476			return (y_to_e(ep, d, a, lev));
477			}
478
479			int
480			i_to_y(char ep, char d, char *a, int lev)
481			{
482
483			if (ep[-1] == 'i') {
484			ep[-1] = 'y';
485			a = d;
486			}
487			return (strip(ep, "", a, lev));
488			}
489
490			int
491			es(char ep, char d, char *a, int lev)
492			{
493
494			if (lev > DLEV)
495			return (0);
496
497			switch (ep[-1]) {
498			default:
499			return (0);
500			case 'i':
501			return (i_to_y(ep, d, a, lev));
502			case 's':
503			case 'h':
504			case 'z':
505			case 'x':
506			return (strip(ep, d, a, lev));
507			}
508			}
509
/*
 * Handler for "-metry".
 *
 * Overwrites the two characters before ep with "er" and returns the
 * result of strip() on the edited stem.  The original characters are
 * not restored by this function.
 */
int
metry(char *ep, char *d, char *a, int lev)
{

	ep[-2] = 'e';
	ep[-1] = 'r';
	return (strip(ep, d, a, lev));
}
518
/*
 * Handler for "-tion".
 *
 * The character two positions before ep selects the action:
 *   'c', 'r' -> trypref() on the stem as it stands
 *   'a'      -> y_to_e()
 *   other    -> 0 (suffix does not apply)
 */
int
tion(char *ep, char *d, char *a, int lev)
{

	switch (ep[-2]) {
	case 'c':
	case 'r':
		return (trypref(ep, a, lev));
	case 'a':
		return (y_to_e(ep, d, a, lev));
	default:
		return (0);
	}
}
532
/*
 * Possible consonant-consonant-e ending.
 *
 * Looks at the two characters before ep to decide whether to retry
 * the stem with an 'e' appended (y_to_e()), to reject it outright, or
 * to hand over to VCe().  The case labels deliberately fall through
 * from one group to the next; each fallthrough is marked.
 *
 * Returns non-zero when some derivation is accepted, else 0.
 */
int
CCe(char *ep, char *d, char *a, int lev)
{

	switch (ep[-1]) {
	case 'l':
		if (vowel(ep[-2]))
			break;
		switch (ep[-2]) {
		case 'l':
		case 'r':
		case 'w':
			break;
		default:
			return (y_to_e(ep, d, a, lev));
		}
		break;
	case 's':
		if (ep[-2] == 's')
			break;
		/* FALLTHROUGH */
	case 'c':
	case 'g':
		if (*ep == 'a')
			return (0);
		/* FALLTHROUGH */
	case 'v':
	case 'z':
		if (vowel(ep[-2]))
			break;
		/* FALLTHROUGH */
	case 'u':
		if (y_to_e(ep, d, a, lev))
			return (1);
		/* Only a stem ending in "ng" gets a second chance below. */
		if (!(ep[-2] == 'n' && ep[-1] == 'g'))
			return (0);
		break;
	default:
		break;
	}
	return (VCe(ep, d, a, lev));
}
572
/*
 * Possible consonant-vowel-consonant-e ending.
 *
 * Returns 0 at once if the stem already ends in 'e'.  If the stem ends
 * in a vowel followed by a non-vowel, an 'e' is written at *ep and the
 * lengthened stem is tried with trypref() and then suffix(); on
 * success 1 is returned with the 'e' left in place, on failure the
 * original character is put back.  Finally the unmodified stem is
 * tried with strip().
 */
int
VCe(char *ep, char *d, char *a, int lev)
{
	char c;

	c = ep[-1];
	if (c == 'e')
		return (0);
	if (!vowel(c) && vowel(ep[-2])) {
		/* Temporarily append 'e' to the stem. */
		c = *ep;
		*ep++ = 'e';
		if (trypref(ep, d, lev) || suffix(ep, lev))
			return (1);
		/* No match: undo the edit. */
		*--ep = c;
	}
	return (strip(ep, d, a, lev));
}
594
595			char *
596			lookuppref(char *wp, char ep)
597			{
598			char **sp;
599			char bp,cp;
600
601			for (sp = preftab; *sp; sp++) {
602			bp = *wp;
603			for (cp = sp; cp; cp++, bp++) {
604			if (tolower((unsigned char)bp) != cp)
605			goto next;
606			}
607			for (cp = bp; cp < ep; cp++) {
608			if (vowel(*cp)) {
609			*wp = bp;
610			return (*sp);
611			}
612			}
613			next: ;
614			}
615			return (0);
616			}
617
618			/*
619			* If the word is not in the dictionary, try stripping off prefixes
620			* until the word is found or we run out of prefixes to check.
621			*/
622			int
623			trypref(char ep, char a, int lev)
624			{
625			char *cp;
626			char *bp;
627			char *pp;
628			int val = 0;
629			char space[20];
630
631			deriv[lev] = a;
632			if (tryword(word, ep, lev))
633			return (1);
634			bp = word;
635			pp = space;
636			deriv[lev+1] = pp;
637			while ((cp = lookuppref(&bp, ep))) {
638			*pp++ = '+';
639			while ((pp = cp++))
640			pp++;
641			if (tryword(bp, ep, lev+1)) {
642			val = 1;
643			break;
644			}
645			if (pp - space >= sizeof(space))
646			return (0);
647			}
648			deriv[lev+1] = deriv[lev+2] = 0;
649			return (val);
650			}
651
652			int
653			tryword(char bp, char ep, int lev)
654			{
655			int i, j;
656			char duple[3];
657
658			if (ep-bp <= 1)
659			return (0);
660			if (vowel(*ep) && monosyl(bp, ep))
661			return (0);
662
663			i = dict(bp, ep);
664			if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) {
665			ep--;
666			deriv[++lev] = duple;
667			duple[0] = '+';
668			duple[1] = *ep;
669			duple[2] = '\0';
670			i = dict(bp, ep);
671			}
672			if (vflag == 0 \|\| i == 0)
673			return (i);
674
675			/* Also tack on possible derivations. (XXX - warn on truncation?) */
676			for (j = lev; j > 0; j--) {
677			if (deriv[j])
678			strlcat(affix, deriv[j], sizeof(affix));
679			}
680			return (i);
681			}
682
/*
 * Test whether the text from bp up to (not including) ep has the shape
 * "non-vowels, one vowel, one final non-vowel".
 *
 * Returns 1 only if all of the following hold, otherwise 0:
 *   - the text is at least two characters long;
 *   - its last character is not a vowel and is neither 'x' nor 'w';
 *   - its second-to-last character is a vowel;
 *   - no earlier character is a vowel.
 *
 * The backward scan stops at bp itself, so no pointer below bp is
 * ever formed.
 */
int
monosyl(char *bp, char *ep)
{
	char *p;

	if (ep < bp + 2)
		return (0);
	if (vowel(ep[-1]) || !vowel(ep[-2]) || ep[-1] == 'x' || ep[-1] == 'w')
		return (0);
	for (p = ep - 2; p > bp; ) {
		if (vowel(*--p))
			return (0);
	}
	return (1);
}
696
697			char *
698			skipv(char *s)
699			{
700
701			if (s >= word && vowel(*s))
702			s--;
703			while (s >= word && !vowel(*s))
704			s--;
705			return (s);
706			}
707
/*
 * Return 1 if c is one of a, e, i, o, u or y in either case, else 0.
 */
int
vowel(unsigned char c)
{
	int lc = tolower(c);

	return (lc == 'a' || lc == 'e' || lc == 'i' ||
	    lc == 'o' || lc == 'u' || lc == 'y');
}
723
724			/*
725			* Crummy way to Britishise.
726			*/
727			void
728			ise(void)
729			{
730			struct suftab *tab;
731
732			for (tab = suftab; tab->suf; tab++) {
733			/* Assume that suffix will contain 'z' if a1 or d1 do */
734			if (strchr(tab->suf, 'z')) {
735			tab->suf = estrdup(tab->suf);
736			ztos(tab->suf);
737			if (strchr(tab->d1, 'z')) {
738			tab->d1 = estrdup(tab->d1);
739			ztos(tab->d1);
740			}
741			if (strchr(tab->a1, 'z')) {
742			tab->a1 = estrdup(tab->a1);
743			ztos(tab->a1);
744			}
745			}
746			}
747			}
748
/*
 * Replace every lower-case 'z' in the NUL-terminated string s with
 * 's', in place.  Other characters, including 'Z', are left alone.
 */
void
ztos(char *s)
{
	while (*s != '\0') {
		if (*s == 'z')
			*s = 's';
		s++;
	}
}
757
/*
 * strdup() that does not return on failure: if the copy cannot be
 * made, err(1, "strdup") is called.  Otherwise the newly allocated
 * copy of s is returned.
 */
char *
estrdup(const char *s)
{
	char *copy = strdup(s);

	if (copy == NULL)
		err(1, "strdup");
	return (copy);
}
767
768			/*
769			* Look up a word in the dictionary.
770			* Returns 1 if found, 0 if not.
771			*/
772			int
773			dict(char bp, char ep)
774			{
775			char c;
776			int i, rval;
777
778			c = *ep;
779			*ep = '\0';
780			if (xflag)
781			printf("=%s\n", bp);
782			for (i = rval = 0; wlists[i].fd != -1; i++) {
783			if ((rval = look((unsigned char *)bp, wlists[i].front,
784			wlists[i].back)) == 1)
785			break;
786			}
787			*ep = c;
788			return (rval);
789			}
790
791			__dead void
792			usage(void)
793			{
794			extern char *__progname;
795
796			fprintf(stderr, "usage: %s [-bvx] [-o found-words] word-list ...\n",
797			__progname);
798			exit(1);
799			}