%top{
#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif /* HAVE_CONFIG_H */

#include "manconfig.h"

/* Flex emits several functions which might reasonably have various
 * attributes applied and many unused macros; none of these are our problem.
 */
#if GNUC_PREREQ(8,0)
#  pragma GCC diagnostic ignored "-Wsuggest-attribute=malloc"
#endif
#pragma GCC diagnostic ignored "-Wsuggest-attribute=pure"
#pragma GCC diagnostic ignored "-Wunused-macros"
}

%{

/*
 * lexgrog.l: extract 'whatis' info from nroff man / formatted cat pages.
 *
 * Copyright (C) 1994, 1995 Graeme W. Wilford. (Wilf.)
 * Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
 *               2011, 2012 Colin Watson.
 *
 * This file is part of man-db.
 *
 * man-db is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * man-db is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with man-db; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Wed Oct 12 18:46:11 BST 1994  Wilf. (G.Wilford@ee.surrey.ac.uk)
 *
 * CJW: Detect grap and vgrind. Understand fill requests. Other improvements
 * in the syntax accepted.
 */

#include <sys/stat.h>
#include <errno.h>
#include <stdbool.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>

#include "error.h"
#include "xalloc.h"

#include "gettext.h"
#define _(String) gettext (String)

#include "encodings.h"
#include "pipeline.h"
#include "sandbox.h"
#include "security.h"
#include "util.h"

#include "decompress.h"
#include "lexgrog.h"
#include "manconv.h"
#include "manconv_client.h"

#define YY_READ_BUF_SIZE	1024
#define MAX_NAME		8192

/* Preprocessor filters that lexgrog can detect, in the order their letters
 * are reported.  Each constant is the index of that filter's slot in the
 * filters[] array; MAX_FILTERS is the number of slots.
 */
enum {
	TBL_FILTER    = 0,	/* tbl */
	EQN_FILTER    = 1,	/* eqn */
	PIC_FILTER    = 2,	/* pic */
	GRAP_FILTER   = 3,	/* grap */
	REF_FILTER    = 4,	/* refer */
	VGRIND_FILTER = 5,	/* vgrind */
	MAX_FILTERS   = 6	/* delimiter: count of the entries above */
};

#define ARRAY_SIZE(array) (sizeof (array) / sizeof ((array)[0]))

extern man_sandbox *sandbox;

/* One entry of a name -> replacement text table (see glyphs and perldocs
 * below).
 */
struct macro {
	const char *name;	/* lookup key, compared by compare_macro () */
	const char *value;	/* text appended to the whatis on a match */
};

static const struct macro glyphs[] = {
	/* It is vital to keep these in strcmp order (sort -t\" -k2)!  They
	 * will be searched using bsearch.
	 * Data from groff_char(7), although I have omitted some that are
	 * particularly unlikely to be used in NAME sections.
	 */
	{ "'A", "Á" },
	{ "'C", "Ć" },
	{ "'E", "É" },
	{ "'I", "Í" },
	{ "'O", "Ó" },
	{ "'U", "Ú" },
	{ "'Y", "Ý" },
	{ "'a", "á" },
	{ "'c", "ć" },
	{ "'e", "é" },
	{ "'i", "í" },
	{ "'o", "ó" },
	{ "'u", "ú" },
	{ "'y", "ý" },
	{ ",C", "Ç" },
	{ ",c", "ç" },
	{ "-D", "Ð" },
	{ ".i", "ı" },
	{ "/L", "Ł" },
	{ "/O", "Ø" },
	{ "/l", "ł" },
	{ "/o", "ø" },
	{ ":A", "Ä" },
	{ ":E", "Ë" },
	{ ":I", "Ï" },
	{ ":O", "Ö" },
	{ ":U", "Ü" },
	{ ":Y", "Ÿ" },
	{ ":a", "ä" },
	{ ":e", "ë" },
	{ ":i", "ï" },
	{ ":o", "ö" },
	{ ":u", "ü" },
	{ ":y", "ÿ" },
	{ "AE", "Æ" },
	{ "Bq", "„" },
	{ "Fc", "»" },
	{ "Fi", "ffi" },
	{ "Fl", "ffl" },
	{ "Fo", "«" },
	{ "IJ", "Ĳ" },
	{ "OE", "Œ" },
	{ "Sd", "ð" },
	{ "TP", "Þ" },
	{ "Tp", "þ" },
	{ "^A", "Â" },
	{ "^E", "Ê" },
	{ "^I", "Î" },
	{ "^O", "Ô" },
	{ "^U", "Û" },
	{ "^a", "â" },
	{ "^e", "ê" },
	{ "^i", "î" },
	{ "^o", "ô" },
	{ "^u", "û" },
	{ "`A", "À" },
	{ "`E", "È" },
	{ "`I", "Ì" },
	{ "`O", "Ò" },
	{ "`U", "Ù" },
	{ "`a", "à" },
	{ "`e", "è" },
	{ "`i", "ì" },
	{ "`o", "ò" },
	{ "`u", "ù" },
	{ "a\"", "˝" },
	{ "a-", "¯" },
	{ "a.", "˙" },
	{ "a^", "^" },
	{ "aa", "´" },
	{ "ab", "˘" },
	{ "ac", "¸" },
	{ "ad", "¨" },
	{ "ae", "æ" },
	{ "ah", "ˇ" },
	{ "ao", "˚" },
	{ "aq", "'" },
	{ "a~", "~" },
	{ "bq", "‚" },
	{ "cq", "’" },
	{ "dq", "\"" },
	{ "em", "—" },
	{ "en", "–" },
	{ "fc", "›" },
	{ "ff", "ff" },
	{ "fi", "fi" },
	{ "fl", "fl" },
	{ "fo", "‹" },
	{ "ga", "`" },
	{ "ha", "^" },
	{ "ho", "˛" },
	{ "hy", "‐" },
	{ "ij", "ĳ" },
	{ "lq", "“" },
	{ "oA", "Å" },
	{ "oa", "å" },
	{ "oe", "œ" },
	{ "oq", "‘" },
	{ "r!", "¡" },
	{ "r?", "¿" },
	{ "rq", "”" },
	{ "ss", "ß" },
	{ "ti", "~" },
	{ "vS", "Š" },
	{ "vZ", "Ž" },
	{ "vs", "š" },
	{ "vz", "ž" },
	{ "~A", "Ã" },
	{ "~N", "Ñ" },
	{ "~O", "Õ" },
	{ "~a", "ã" },
	{ "~n", "ñ" },
	{ "~o", "õ" }
};

static const struct macro perldocs[] = {
	/* It is vital to keep these in strcmp order (sort -t\" -k2)!  They
	 * will be searched using bsearch.
	 * Data from Pod/Man.pm.
	 */
	{ "--", "-" },
	{ "Aq", "'" },
	{ "C'", "'" },
	{ "C+", "C++" },
	{ "C`", "`" },
	{ "L\"", "\"" },
	{ "PI", "π" },
	{ "R\"", "\"" }
};

/* Helpers called from the scanner actions; defined after the rules. */
static void add_str_to_whatis (const char *string, size_t length);
static void add_char_to_whatis (unsigned char c);
static void add_separator_to_whatis (void);
static void add_wordn_to_whatis (const char *string, size_t length);
static void add_word_to_whatis (const char *string);
static void add_glyph_to_whatis (const char *string, size_t length);
static void add_perldoc_to_whatis (const char *string, size_t length);
static void mdoc_text (const char *string);
static void newline_found (void);

/* Buffer in which the whatis text is accumulated. */
static char newname[MAX_NAME];
/* Next free position in newname. */
static char *p_name;
/* Name printed in the truncation warning issued by too_big (). */
static const char *fname;
/* One slot per *_FILTER index: '_' until the matching request is seen in
 * MAN_REST, then the filter's letter.
 */
static char filters[MAX_FILTERS];

/* Cleared by a .nf request and set by .fi; selects what newline_found ()
 * inserts at a line break.
 */
static bool fill_mode;
/* Set after a typeface request whose argument opens with a double quote;
 * while set, add_char_to_whatis () drops the next '"' instead of storing it.
 */
static bool waiting_for_quote;

/* Input source that YY_INPUT reads from. */
static decompress *decomp;

/* Input hook for the generated scanner: ask decompress_read () for up to
 * max_size bytes from decomp, copy whatever it returned into buf and report
 * the byte count in result.  A NULL block or a zero size is reported as
 * YY_NULL.
 *
 * NOTE(review): buf[size] is one past the bytes copied, so when size ==
 * max_size it lies outside the max_size bytes requested.  Presumably this
 * relies on the scanner reserving spare bytes after its read area; confirm
 * against the flex documentation before changing buffer handling.
 */
#define YY_INPUT(buf,result,max_size) { \
	size_t size = max_size; \
	const char *block = decompress_read (decomp, &size); \
	if (block && size != 0) { \
		memcpy (buf, block, size); \
		buf[size] = '\0'; \
		result = size; \
	} else \
		result = YY_NULL; \
}
#define YY_NO_INPUT
%}

%option ecs meta-ecs
%option 8bit batch caseful never-interactive
%option nostdinit
%option warn
%option noyywrap nounput

%x MAN_PRENAME
%x MAN_NAME
%x MAN_DESC
%x MAN_DESC_AT
%x MAN_DESC_BSX
%x MAN_DESC_BX
%x MAN_DESC_BX_RELEASE
%x MAN_DESC_DQ
%x MAN_DESC_FX
%x MAN_DESC_NX
%x MAN_DESC_OX
%x CAT_NAME
%x CAT_FILE
%x MAN_FILE
%x CAT_REST
%x MAN_REST
%x FORCE_EXIT

 /* Character classes. */
digit		[[:digit:]]
upper		[[:upper:]]
alpha		[[:alpha:]]
blank		[[:blank:]]
blank_eol	[[:blank:]\r\n]
 /* An alphanumeric followed by anything up to the next blank or line end. */
word		[[:alnum:]][^[:blank:]\r\n]*
 /* Line endings: eol is a single LF with an optional preceding CR; bol is
    one or more of them (used as a "start of line" prefix); next is zero or
    more; empty and indent are a line ending followed by optional /
    mandatory blanks. */
eol		\r?\n
bol		{eol}+
next		{eol}*
empty		{eol}{blank}*
indent		{eol}{blank}+
dbl_quote	\"
 /* Inline roff font (\f) and point-size (\s) escapes, alone or paired in
    either order. */
font_change	\\f([[:upper:]1-4]|\({upper}{2})
size_change	\\s[+-]?{digit}
style_change	({font_change}{size_change}?|{size_change}{font_change}?)
 /* .B .BI .BR .I .IB .IR .RB .RI .SB .SM */
typeface	\.(B[IR]?|I[BR]?|R[BI]|S[BM])
 /* A dot, then S or s, then one of H, Y, S in either case (e.g. .SH, .Sh,
    .SS). */
sec_request	\.[Ss][HhYySs]
 /* Start of a roff comment line: ' or . followed by \" */
comment		['.]\\{dbl_quote}

 /* Please add to this list if you know how. */
 /* Note that, since flex only supports UTF-8 by accident, character classes
  * including non-ASCII characters must be written out as (a|b|c|d) rather
  * than [abcd].
  */
ar_name		(اﻹسم|الإسم)
 /* ИМЕ also works for mk */
bg_name		И(М|м)(Е|е)
cs_name		(J[Mm](É|é|\\\('[Ee]|E|e)[Nn][Oo]|N(Á|á)[Zz][Ee][Vv])
da_name		N[Aa][Vv][Nn]
de_name		B[Ee][Zz][Ee][Ii][Cc][Hh][Nn][Uu][Nn][Gg]
en_name		N[Aa][Mm][Ee]
eo_name		N[Oo][Mm][Oo]
es_name		N[Oo][Mm][Bb][Rr][Ee]
fa_name		نام
fi_name		N[Ii][Mm][Ii]
fr_name		N[Oo][Mm]
hu_name		N(É|é|\\\('[Ee]|E|e)[Vv]
id_name		N[Aa][Mm][Aa]
 /* NOME also works for gl, pt */
it_name		N[Oo][Mm][Ee]
ja_name		(名|̾)(前|称)
ko_name		(이름|명칭)
latin_name	N[Oo][Mm][Ee][Nn]
lt_name		P[Aa][Vv][Aa][Dd][Ii][Nn][Ii][Mm][Aa][Ss]
nl_name		N[Aa][Aa][Mm]
pl_name		N[Aa][Zz][Ww][Aa]
ro_name		N[Uu][Mm][Ee]
ru_name         (И(М|м)(Я|я)|Н(А|а)(З|з)(В|в)(А|а)(Н|н)(И|и)(Е|е)|Н(А|а)(И|и)(М|м)(Е|е)(Н|н)(О|о)(В|в)(А|а)(Н|н)(И|и)(Е|е))
sk_name		M[Ee][Nn][Oo]
sr_name		(И(М|м)(Е|е)|Н(А|а)(З|з)(И|и)(В|в))
srlatin_name	(I[Mm][Ee]|N[Aa][Zz][Ii][Vv])
sv_name		N[Aa][Mm][Nn]
ta_name		பெய
tr_name		(A[Dd]|(İ|i)S(İ|i)M)
uk_name		Н(А|а)(З|з)(В|в)(А|а)
vi_name		T(Ê|ê)[Nn]
zh_CN_name	名{blank}?(称|字){blank}?.*
zh_TW_name	(名{blank}?(稱|字)|命令名){blank}?.*
name		({ar_name}|{bg_name}|{cs_name}|{da_name}|{de_name}|{en_name}|{eo_name}|{es_name}|{fa_name}|{fi_name}|{fr_name}|{hu_name}|{id_name}|{it_name}|{ja_name}|{ko_name}|{latin_name}|{lt_name}|{nl_name}|{pl_name}|{ro_name}|{ru_name}|{sk_name}|{sr_name}|{srlatin_name}|{sv_name}|{ta_name}|{tr_name}|{uk_name}|{vi_name}|{zh_CN_name}|{zh_TW_name})
name_sec	{dbl_quote}?{style_change}?{name}{style_change}?({blank}*{dbl_quote})?

 /* eptgrv : eqn, pic, tbl, grap, refer, vgrind */
 /* Requests looked for by the MAN_REST rules; each sets one slot of
    filters[].  refer has two request forms, .R1 and .[ */
tbl_request	\.TS
eqn_request	\.EQ
pic_request	\.PS
grap_request	\.G1
ref1_request	\.R1
ref2_request	\.\[
vgrind_request	\.vS

%%

 /* Begin NAME section processing.  In roff source, a section request
    followed by a recognised NAME heading enters MAN_PRENAME.  In a cat page
    the heading must be preceded by at least two {empty} matches and
    followed by an indented line; that enters CAT_NAME. */
<MAN_FILE>{sec_request}{blank_eol}+{name_sec}{blank}*	BEGIN (MAN_PRENAME);
<CAT_FILE>{empty}{2,}{name}{blank}*{indent}		BEGIN (CAT_NAME);

 /* General text matching: these rules have no action, so whatever they
    match in a man page before the NAME heading is discarded.  The first
    three patterns consume longer stretches in one match; the last is the
    single-character fallback. */
<MAN_FILE>{
	\.[^Ss\r\n].*				|
	\..{0,3}{dbl_quote}?.{0,4}{dbl_quote}? 	|
	{comment}.*				|
	.|{eol}
}

 /* The cat-page counterpart: no action, so text ahead of the NAME heading
    is discarded, in runs where possible and one character at a time
    otherwise. */
<CAT_FILE>{
	.{1,9}		|
	[ ]*		|
	{eol}{2,}	|
	.|{eol}
}

 /* After the NAME section, look for preprocessor requests at the start of
    a line and record each one's letter in its filters[] slot.  Both refer
    forms share the 'r' slot. */
<MAN_REST>{
	{bol}{tbl_request}		filters[TBL_FILTER] = 't';
	{bol}{eqn_request}		filters[EQN_FILTER] = 'e';
	{bol}{pic_request}		filters[PIC_FILTER] = 'p';
	{bol}{grap_request}		filters[GRAP_FILTER] = 'g';
	{bol}{ref1_request}		|
	{bol}{ref2_request}		filters[REF_FILTER] = 'r';
	{bol}{vgrind_request}		filters[VGRIND_FILTER] = 'v';
}
 /* End of input after the NAME section: terminate the whatis text and stop
    the scanner through yyterminate (), unlike the default <<EOF>> rule,
    which returns 1. */
<MAN_REST><<EOF>>	{	/* exit */
	*p_name = '\0'; /* terminate the string */
	yyterminate ();
}
 /* Everything else after the NAME section is ignored. */
<MAN_REST>.+|{eol}

 /* rules to end NAME section processing */
 /* FORCE_EXIT is entered by too_big () when the whatis text would overflow
    newname; the next character scanned terminates the text and stops the
    scanner. */
<FORCE_EXIT>.|{eol}	{	/* forced exit */
	*p_name = '\0'; /* terminate the string */
	yyterminate ();
}

 /* Another section request, or end of input, straight after the NAME
    heading: terminate the whatis text (nothing has been added to it in
    MAN_PRENAME) and go on to scan the rest of the page. */
<MAN_PRENAME>{bol}{sec_request}{blank}*	|
<MAN_PRENAME><<EOF>>	{	/* no NAME at all */
	*p_name = '\0';
	BEGIN (MAN_REST);
}

 /* Requests that the MAN_NAME rules handle themselves mark the start of
    the name text.  We need to match the whole line so that we beat the
    following roff catch-all, so use yyless (0) to push all of it back and
    let MAN_NAME rescan it. */
<MAN_PRENAME>{
	{bol}{typeface}{blank}.*	|
	{bol}\.Tn{blank}.*		|
	{bol}\.ft{blank}.*		|
	{bol}\.V[be]{blank}.*		|
	{bol}\.IX{blank}.*		|
	{bol}\.Nm{blank}.*		{
		yyless (0);
		BEGIN (MAN_NAME);
	}
}

 /* Skip over initial roff requests in NAME section: any other line that
    starts with ' or . is discarded whole. */
<MAN_PRENAME>{bol}['.].*

 /* A line ending, optional blanks and another line ending: keep only the
    first character of the match and push the rest back.  The use of yyless
    here is evil. */
<MAN_PRENAME>{empty}{eol}		yyless (1);

 /* Anything else starts the name text: push it back and let the MAN_NAME
    rules scan it. */
<MAN_PRENAME>.|{eol}	{
	yyless (0);
	BEGIN (MAN_NAME);
}

 /* Anything below ends the name/description text: the whatis string is
    terminated and the remainder of the page is scanned in MAN_REST. */
<MAN_NAME,MAN_DESC>{
	{bol}{sec_request}{blank}*	| 	/* Another section */
	{bol}\.X{upper}{blank}+		|	/* special - hpux */
	{bol}\.sp{blank}*		|	/* vertical spacing */
	{bol}\.ig{blank}*		|	/* block comment */
	{bol}\.de[1i]?{blank}*		|	/* macro definition */
	{bol}\.i[ef]{blank}*		|	/* conditional */
	{empty}{bol}.+			|
	<<EOF>>				{	/* terminate the string */
		*p_name = '\0';
		BEGIN (MAN_REST);
	}
}

 /* End of the NAME text in a cat page: a line starting with "S" followed
    by y, Y, e or E, two or more line endings followed by text, or a double
    underscore.  The string is terminated and scanning stops at once; no
    filter detection is done for cat pages. */
<CAT_NAME>{
	{bol}S[yYeE]	|
	{eol}{2,}.+	|
	{next}__	{	/* terminate the string */
		*p_name = '\0';
		BEGIN (CAT_REST);
		yyterminate ();
	}
}

 /* ROFF request removal: the request itself is dropped and only a line
    break is recorded through newline_found (). */
<MAN_NAME,MAN_DESC>{
 /* some include quoting; dealing with this is unpleasant */
	{bol}{typeface}{blank}+\"	{
		newline_found ();
		/* the opening quote was consumed by this match; have
		 * add_char_to_whatis () drop the next one too
		 */
		waiting_for_quote = true;
	}

	{bol}{typeface}{blank}+		|	/* type face commands */
	{bol}\.Tn{blank}+		|	/* mdoc trade name */
	{bol}\.ft{blank}.*		|	/* font change */
	{bol}\.V[be]{blank}.*		|	/* pod2man, verbatim mode */
	{bol}\.IX{blank}.*		|	/* .IX line */
	{bol}\.Nm{blank}+		|	/* mdoc name */
	{bol}\.PD{blank}*		|	/* paragraph spacing */
	{bol}\\&			|	/* non-breaking space */
	{next}{comment}.*		{	/* per line comments */
		newline_found ();
	}
}

 /* No-op requests: a line holding only "." (with optional blanks) or only
    ".." counts as a plain line break. */
<MAN_NAME,MAN_DESC>{
	{bol}\.{blank}*$		newline_found ();
	{bol}\.\.$			newline_found ();
}

 /* Toggle fill mode: .nf clears it and .fi sets it.  newline_found () uses
    the flag to choose between inserting a space and inserting a 0x11
    separator at a line break. */
<MAN_NAME,MAN_DESC>{
	{bol}\.nf.*			fill_mode = false;
	{bol}\.fi.*			fill_mode = true;
}

 /* A '-' directly before a line ending is discarded together with any
    blanks and line endings that follow it. */
<CAT_NAME>-{eol}{blank_eol}*		/* strip continuations */

 /* convert to DASH: the first dash-like token after the names (a \(xx or
    \[xx] dash glyph, "--" or "\-" set off by whitespace, or the mdoc .Nd
    request) becomes the "- " separator and the description begins. */
<MAN_NAME>{
	{next}{blank}*\\\((mi|hy|em|en){blank}*		|
	{next}{blank}*\\\[(mi|hy|em|en)\]{blank}*	|
	{next}{blank_eol}+[-\\]-{blank}*		|
	{next}{blank_eol}*[-\\]-{blank}+		|
	{bol}\.Nd{blank}*			{
		add_separator_to_whatis ();
		BEGIN (MAN_DESC);
	}
}
 /* In a cat page the separator is one or two hyphens with blanks before and
    whitespace after; the start condition stays CAT_NAME. */
<CAT_NAME>{next}{blank}+-{1,2}{blank_eol}+	add_separator_to_whatis ();

 /* Escape sequences and special characters.  Most patterns start with
    {next}, which may consume line endings in front of the escape, so yytext
    does not necessarily begin at the backslash. */
<MAN_NAME,MAN_DESC>{
	{next}\\[\\e]			add_char_to_whatis ('\\');
	{next}\\('|\(aa)		add_char_to_whatis ('\'');
	{next}\\(`|\(ga)		add_char_to_whatis ('`');
	{next}\\(-|\((mi|hy|em|en))	add_char_to_whatis ('-');
	{next}\\\[(mi|hy|em|en)\]	add_char_to_whatis ('-');
	{next}\\\.			add_char_to_whatis ('.');
	{next}((\\[ 0t~])|[ ]|\t)*	add_char_to_whatis (' ');
	{next}\\\((ru|ul)		add_char_to_whatis ('_');
	{next}\\\\t			add_char_to_whatis ('\t');

	{next}\\[|^&!%acdpruz{}\r\n]	/* various useless control chars */
	{next}\\[bhlLvx]{blank}*'[^']+'	/* various inline functions */

	{next}\\\$[1-9]			/* interpolate arg */

	/* roff named glyphs: the two-character name is located from the end */
	/* of the match, because {next} may have put line endings in front */
	{next}\\\(..			add_glyph_to_whatis (yytext + yyleng - 2, 2);
	\\\[..\]			add_glyph_to_whatis (yytext + yyleng - 3, 2);
	/* perldoc strings, located from the end of the match for the same reason */
	{next}\\\*\(..			add_perldoc_to_whatis (yytext + yyleng - 2, 2);
	\\\*\[..\]			add_perldoc_to_whatis (yytext + yyleng - 3, 2);
	{next}\\\*.			add_perldoc_to_whatis (yytext + yyleng - 1, 1);

	{next}\\["#].* 			/* comment */

	{next}{font_change}		/* font changes */
	{next}\\k{alpha}		/* mark input place in register */

	{next}\\n(\({alpha})?{alpha}	/* interpolate number register */
	{next}\\o\"[^"]+\"		/* overstrike chars */

	{next}{size_change}		/* size changes */
	{next}\\w{blank}*'[^']+'[^ \t]*	/* width of string */

	{next}\\			/* catch all */

	{next}\(\\\|\){blank}*		/* function() in hpux */
}

 /* some people rather ambitiously use non-trivial mdoc macros in NAME
    sections; cope with those that have been seen in the wild, and a few
    more.  Macros that take an optional argument switch to a dedicated start
    condition which reads the argument and then returns to MAN_DESC. */
<MAN_DESC>{
	{bol}\.At{blank}*		BEGIN (MAN_DESC_AT);
	{bol}\.Bsx{blank}*		BEGIN (MAN_DESC_BSX);
	{bol}\.Bx{blank}*		BEGIN (MAN_DESC_BX);
	{bol}\.Fx{blank}*		BEGIN (MAN_DESC_FX);
	{bol}\.Nx{blank}*		BEGIN (MAN_DESC_NX);
	{bol}\.Ox{blank}*		BEGIN (MAN_DESC_OX);
	{bol}\.Ux{blank}*		add_word_to_whatis ("UNIX");

	{bol}\.Dq{blank}*	{
		/* opening quote now; MAN_DESC_DQ adds the closing one */
		add_word_to_whatis ("\"");
		BEGIN (MAN_DESC_DQ);
	}
}

<MAN_DESC_AT>{
	32v{blank}*		mdoc_text ("Version 32V AT&T UNIX");
	v1{blank}*		mdoc_text ("Version 1 AT&T UNIX");
	v2{blank}*		mdoc_text ("Version 2 AT&T UNIX");
	v3{blank}*		mdoc_text ("Version 3 AT&T UNIX");
	v4{blank}*		mdoc_text ("Version 4 AT&T UNIX");
	v5{blank}*		mdoc_text ("Version 5 AT&T UNIX");
	v6{blank}*		mdoc_text ("Version 6 AT&T UNIX");
	v7{blank}*		mdoc_text ("Version 7 AT&T UNIX");
	V{blank}*		mdoc_text ("AT&T System V UNIX");
	V.1{blank}*		mdoc_text ("AT&T System V.1 UNIX");
	V.2{blank}*		mdoc_text ("AT&T System V.2 UNIX");
	V.3{blank}*		mdoc_text ("AT&T System V.3 UNIX");
	V.4{blank}*		mdoc_text ("AT&T System V.4 UNIX");
	.|{eol}		{
		yyless (0);
		mdoc_text ("AT&T UNIX");
	}
}

 /* Argument of the mdoc .Bsx macro: "BSD/OS", followed by the next word if
    there is one; otherwise the input is pushed back.  Both paths return to
    MAN_DESC. */
<MAN_DESC_BSX>{
	{word}		{
		add_word_to_whatis ("BSD/OS");
		add_wordn_to_whatis (yytext, yyleng);
		BEGIN (MAN_DESC);
	}
	.|{eol}		{
		yyless (0);
		mdoc_text ("BSD/OS");
	}
}

<MAN_DESC_BX>{
	-alpha{blank}*		mdoc_text ("BSD (currently in alpha test)");
	-beta{blank}*		mdoc_text ("BSD (currently in beta test)");
	-devel{blank}*		mdoc_text ("BSD (currently under development");
	{word}{blank}*	{
		add_wordn_to_whatis (yytext, yyleng);
		add_str_to_whatis ("BSD", 3);
		BEGIN (MAN_DESC_BX_RELEASE);
	}
	.|{eol}		{
		yyless (0);
		mdoc_text ("BSD");
	}
}

 /* Optional release name after a .Bx version: a recognised name is appended
    as a "-Name" suffix in canonical capitalisation; anything else is pushed
    back untouched.  Every path returns to MAN_DESC. */
<MAN_DESC_BX_RELEASE>{
	[Rr]eno{blank}*		{
		add_str_to_whatis ("-Reno", 5);
		BEGIN (MAN_DESC);
	}
	[Tt]ahoe{blank}*	{
		add_str_to_whatis ("-Tahoe", 6);
		BEGIN (MAN_DESC);
	}
	[Ll]ite{blank}*		{
		add_str_to_whatis ("-Lite", 5);
		BEGIN (MAN_DESC);
	}
	[Ll]ite2{blank}*	{
		add_str_to_whatis ("-Lite2", 6);
		BEGIN (MAN_DESC);
	}
	.|{eol}			{
		yyless (0);
		BEGIN (MAN_DESC);
	}
}

 /* Argument of the mdoc .Dq macro: the rest of the line is copied as is and
    followed by a closing double quote (the opening one was added when .Dq
    was matched). */
<MAN_DESC_DQ>.*		{
	add_str_to_whatis (yytext, yyleng);
	add_char_to_whatis ('"');
	BEGIN (MAN_DESC);
}

 /* Argument of the mdoc .Fx macro: "FreeBSD", followed by the next word if
    there is one; otherwise the input is pushed back.  Both paths return to
    MAN_DESC. */
<MAN_DESC_FX>{
	{word}		{
		add_word_to_whatis ("FreeBSD");
		add_wordn_to_whatis (yytext, yyleng);
		BEGIN (MAN_DESC);
	}
	.|{eol}		{
		yyless (0);
		mdoc_text ("FreeBSD");
	}
}

 /* Argument of the mdoc .Nx macro: "NetBSD", followed by the next word if
    there is one; otherwise the input is pushed back.  Both paths return to
    MAN_DESC. */
<MAN_DESC_NX>{
	{word}		{
		add_word_to_whatis ("NetBSD");
		add_wordn_to_whatis (yytext, yyleng);
		BEGIN (MAN_DESC);
	}
	.|{eol}		{
		yyless (0);
		mdoc_text ("NetBSD");
	}
}

 /* Argument of the mdoc .Ox macro: "OpenBSD", followed by the next word if
    there is one; otherwise the input is pushed back.  Both paths return to
    MAN_DESC. */
<MAN_DESC_OX>{
	{word}		{
		add_word_to_whatis ("OpenBSD");
		add_wordn_to_whatis (yytext, yyleng);
		BEGIN (MAN_DESC);
	}
	.|{eol}		{
		yyless (0);
		mdoc_text ("OpenBSD");
	}
}

 /* collapse spaces, escaped spaces, tabs, newlines to a single space: any
    run of blanks and "\ " escapes, with optional leading line endings,
    contributes exactly one ' ' to the whatis */
<CAT_NAME>{next}((\\[ ])|{blank})*	add_char_to_whatis (' ');

 /* a ROFF break request, a paragraph request, or an indentation change
    usually means we have multiple whatis definitions, provide a separator
    for later processing: the byte 0x11 is stored in the whatis and scanning
    resumes in MAN_NAME so that a new list of names can follow */
<MAN_NAME,MAN_DESC>{
	{bol}\.br{blank}*		|
	{bol}\.LP{blank}*		|
	{bol}\.PP{blank}*		|
	{bol}\.P{blank}*		|
	{bol}\.IP{blank}.*		|
	{bol}\.HP{blank}.*		|
	{bol}\.RS{blank}.*		|
	{bol}\.RE{blank}.*		{
		add_char_to_whatis ((char) 0x11);
		BEGIN (MAN_NAME);
	}
}

 /* any other roff request we don't recognise terminates definitions: this
    two-part pattern (line ending plus ' or .) only wins when none of the
    longer request patterns above matched */
<MAN_NAME,MAN_DESC>{bol}['.]	{
	*p_name = '\0';
	BEGIN (MAN_REST);
}

 /* pass words as a chunk. speed optimization: a run of alphanumerics is
    appended with one add_str_to_whatis () call instead of one
    add_char_to_whatis () call per character */
<MAN_NAME,MAN_DESC>[[:alnum:]]*		add_str_to_whatis (yytext, yyleng);

 /* normalise the comma (,) separators: whatever whitespace surrounds a
    comma, exactly ", " is stored.  In cat pages the whitespace after the
    comma may include line endings. */
<CAT_NAME>{blank}*,[ \t\r\n]*		|
<MAN_NAME,MAN_DESC>{blank}*,{blank}*	add_str_to_whatis (", ", 2);

 /* A character at the start of a new line that nothing above claimed:
    record the line break, then store the character itself (the last byte
    of the match). */
<CAT_NAME,MAN_NAME,MAN_DESC>{bol}.	{
	newline_found ();
	add_char_to_whatis (yytext[yyleng - 1]);
}

 /* Fallback: any other single character is stored unchanged. */
<CAT_NAME,MAN_NAME,MAN_DESC>.		add_char_to_whatis (*yytext);

 /* default EOF rule: end of input in a start condition that has no <<EOF>>
    rule of its own.  The non-zero return value makes
    find_name_decompressed () return 0 without setting any result. */
<<EOF>>	return 1;

%%

/* Warn that the whatis text for fname does not fit in newname, then switch
 * the scanner to FORCE_EXIT so that the next rule to fire terminates it.
 */
static void too_big (void)
{
	/* MAX_NAME is a compile-time constant, but the message still goes
	 * through ngettext so that the plural form stays correct should the
	 * macro ever be changed.
	 */
	error (0, 0,
	       ngettext ("warning: whatis for %s exceeds %d byte, "
			 "truncating.",
			 "warning: whatis for %s exceeds %d bytes, "
			 "truncating.",
			 MAX_NAME),
	       fname,
	       MAX_NAME);

	BEGIN (FORCE_EXIT);
}

/* Append length bytes of string to newname.  If that would leave no room
 * for the terminating NUL, nothing is copied and too_big () is called
 * instead.
 */
static void add_str_to_whatis (const char *string, size_t length)
{
	size_t used = (size_t) (p_name - newname);

	if (used + length >= MAX_NAME) {
		too_big ();
		return;
	}

	(void) strncpy (p_name, string, length);
	p_name += length;
}

/* Append one byte to newname.  A full buffer triggers too_big ().  While
 * waiting_for_quote is set, the first '"' offered is swallowed and the flag
 * is cleared instead of the byte being stored.
 */
static void add_char_to_whatis (unsigned char c)
{
	if (p_name - newname + 1 >= MAX_NAME) {
		too_big ();
		return;
	}

	if (waiting_for_quote && c == '"') {
		waiting_for_quote = false;
		return;
	}

	*p_name++ = c;
}

/* Append the " - " separator to newname.  The leading space is left out
 * when newname is still empty or already ends in a space.
 */
static void add_separator_to_whatis (void)
{
	bool need_space = p_name != newname && p_name[-1] != ' ';

	if (need_space)
		add_char_to_whatis (' ');
	add_str_to_whatis ("- ", 2);
}

/* Append the first length bytes of string to newname as a separate word: a
 * space is added first unless newname is empty or already ends in one, and
 * trailing spaces of the word itself are not copied.
 */
static void add_wordn_to_whatis (const char *string, size_t length)
{
	bool need_space = p_name != newname && p_name[-1] != ' ';

	if (need_space)
		add_char_to_whatis (' ');

	/* drop trailing spaces from the word */
	for (; length > 0 && string[length - 1] == ' '; length--)
		;

	if (length > 0)
		add_str_to_whatis (string, length);
}

/* Append a NUL-terminated word to newname; see add_wordn_to_whatis (). */
static void add_word_to_whatis (const char *string)
{
	size_t length = strlen (string);

	add_wordn_to_whatis (string, length);
}

/* Search key handed to bsearch () by add_macro_to_whatis (): a name that is
 * given by pointer and length rather than as a NUL-terminated string.
 */
struct compare_macro_key {
	const char *string;	/* first byte of the name */
	size_t length;		/* number of bytes in the name */
};

/* bsearch () comparison callback.  left is a struct compare_macro_key,
 * right is a table entry (struct macro).  Returns the strncmp () result over
 * the key's length if the two differ there, -1 if the entry's name merely
 * starts with the key, and 0 on an exact match.
 */
static int compare_macro (const void *left, const void *right)
{
	const struct compare_macro_key *key = left;
	const struct macro *entry = right;
	int order = strncmp (key->string, entry->name, key->length);

	if (order != 0)
		return order;

	/* The first key->length bytes agree, so entry->name holds at least
	 * key->length + 1 bytes including its terminator.  A longer name
	 * sorts after the key.
	 */
	return entry->name[key->length] ? -1 : 0;
}

/* Look up the length-byte name at string in the sorted table macros (which
 * has n_macros entries) and, if it is present, append its replacement text
 * to newname.  Unknown names add nothing.
 */
static void add_macro_to_whatis (const struct macro *macros, size_t n_macros,
				 const char *string, size_t length)
{
	struct compare_macro_key key;
	const struct macro *found;

	key.length = length;
	key.string = string;

	found = bsearch (&key, macros, n_macros, sizeof (struct macro),
			 compare_macro);
	if (!found)
		return;

	add_str_to_whatis (found->value, strlen (found->value));
}

static void add_glyph_to_whatis (const char *string, size_t length)
{
	add_macro_to_whatis (glyphs, ARRAY_SIZE (glyphs), string, length);
}

static void add_perldoc_to_whatis (const char *string, size_t length)
{
	add_macro_to_whatis (perldocs, ARRAY_SIZE (perldocs), string, length);
}

/* Emit the expansion of an mdoc macro as a word and hand control back to
 * the MAN_DESC start condition.
 */
static void mdoc_text (const char *string)
{
	add_wordn_to_whatis (string, strlen (string));
	BEGIN (MAN_DESC);
}

static void newline_found (void)
{
	/* If we are mid p_name and the last added char was not a space,
	 * best add one.
	 */
	if (p_name != newname && *(p_name - 1) != ' ') {
		if (fill_mode)
			add_char_to_whatis (' ');
		else {
			add_char_to_whatis ((char) 0x11);
			BEGIN (MAN_NAME);
		}
	}
	waiting_for_quote = false;
}

/* Open file, set up any encoding conversion and col post-processing, and
 * extract its whatis information into p_lg.
 *
 * file     - path to read, or "-" to read a duplicate of standard input.
 * filename - passed on to find_name_decompressed ().
 * p_lg     - p_lg->type is read here to decide whether col is needed; the
 *            results are filled in by find_name_decompressed ().
 * encoding - source encoding of the page, or NULL to derive it from the
 *            page's language directory (not done when reading from "-").
 *
 * Returns 0 if the file cannot be examined, is a directory, cannot be
 * opened, or in-process encoding conversion fails; otherwise returns
 * whatever find_name_decompressed () returns.
 */
int find_name (const char *file, const char *filename, lexgrog *p_lg,
	       const char *encoding)
{
	/* col is only run over cat pages, and only if a col program is
	 * configured.
	 */
	bool run_col = p_lg->type == CATPAGE && *PROG_COL != '\0';
	char *page_encoding = NULL;
	decompress *d;
	int ret = 0;

	if (strcmp (file, "-") != 0) {
		struct stat st;
		int decompress_flags = 0;

		if (stat (file, &st)) {
			error (0, errno, "%s", file);
			return 0;
		}

		if (S_ISDIR (st.st_mode)) {
			error (0, EISDIR, "%s", file);
			return 0;
		}

		/* If we're looking at a cat page, then we need to run col
		 * over it, which doesn't work conveniently with an
		 * in-process decompressor.
		 */
		if (!run_col)
			decompress_flags |= DECOMPRESS_ALLOW_INPROCESS;

		drop_effective_privs ();
		d = decompress_open (file, decompress_flags);
		if (!d) {
			/* report before regaining privileges so that errno
			 * still belongs to decompress_open ()
			 */
			error (0, errno, _("can't open %s"), file);
			regain_effective_privs ();
			return 0;
		}
		regain_effective_privs ();

		if (!encoding) {
			char *lang = lang_dir (file);

			page_encoding = get_page_encoding (lang);
			free (lang);
		}
	} else
		d = decompress_fdopen (dup (STDIN_FILENO));

	if (encoding && !page_encoding)
		page_encoding = xstrdup (encoding);

	if (page_encoding) {
		if (decompress_is_pipeline (d))
			add_manconv (decompress_get_pipeline (d),
				     page_encoding, "UTF-8");
		else if (manconv_inprocess (d, page_encoding, "UTF-8") != 0)
			/* manconv should already have written to stderr, so
			 * just return zero (i.e. no result).
			 */
			goto out;
	}

	if (run_col) {
		pipecmd *col_cmd = pipecmd_new_args
			(PROG_COL, "-b", "-p", "-x", (void *) 0);

		pipecmd_pre_exec (col_cmd, sandbox_load, sandbox_free,
				  sandbox);
		pipeline_command (decompress_get_pipeline (d), col_cmd);
	}

	decompress_start (d);
	ret = find_name_decompressed (d, filename, p_lg);

out:
	free (page_encoding);
	decompress_free (d);
	return ret;
}

/* Run the scanner over an already opened input and collect the whatis text
 * and the list of preprocessor filters.
 *
 * d        - input source; stored in decomp for YY_INPUT and passed to
 *            decompress_wait () once scanning has finished.
 * filename - name used in the truncation warning.
 * p_lg     - p_lg->type selects the cat-page or man-page start condition.
 *            When the scanner returns 0, p_lg->whatis and p_lg->filters are
 *            set to xstrdup () copies of the results.
 *
 * Returns 0 without touching p_lg->whatis or p_lg->filters if the scanner
 * returned non-zero (as the default <<EOF>> rule does).  Otherwise returns
 * the first byte of the trimmed whatis text, which is 0 when it is empty.
 */
int find_name_decompressed (decompress *d, const char *filename, lexgrog *p_lg)
{
	/* One byte more than there are filters, so that the list is still
	 * NUL-terminated when every filter was detected.
	 */
	char f_tmp[MAX_FILTERS + 1];
	char *end;
	int j, k;
	int ret;

	decomp = d;

	fname = filename;
	*(p_name = newname) = '\0';
	memset (filters, '_', sizeof (filters));

	fill_mode = true;
	waiting_for_quote = false;

	if (p_lg->type == CATPAGE)
		BEGIN (CAT_FILE);
	else
		BEGIN (MAN_FILE);

	drop_effective_privs ();

	yyrestart (NULL);
	ret = yylex ();

	regain_effective_privs ();

	decompress_wait (decomp);

	if (ret)
		return 0;

	/* Wipe out any trailing spaces.  The walk stops at the start of the
	 * buffer, so text consisting only of spaces becomes the empty string
	 * instead of being read from before newname[0].
	 */
	end = strchr (newname, '\0');
	while (end > newname && end[-1] == ' ')
		--end;
	*end = '\0';

	/* Skip any leading spaces. */
	for (p_name = newname; *p_name == ' '; p_name++)
		;
	p_lg->whatis = xstrdup (p_name);

	/* Collect the letters of the detected filters in table order; "-"
	 * stands for "none".
	 */
	memset (f_tmp, '\0', sizeof (f_tmp));
	f_tmp[0] = '-';
	for (j = k = 0; j < MAX_FILTERS; j++)
		if (filters[j] != '_')
			f_tmp[k++] = filters[j];
	p_lg->filters = xstrdup (f_tmp);

	return p_name[0];
}