% !TeX TS-program = lualatex
% !TeX encoding = UTF-8
% This is an AI-generated translation into English of the “tokstools” package manual
\documentclass[english,a4paper,10pt]{article}
\usepackage[margin=2cm]{geometry}
\usepackage[bottom]{footmisc}
\usepackage[libertine]{newtxmath}
\usepackage{libertineRoman,enumitem,babel,xspace,tokstools,hyperref,array,footnotehyper,multicol}
\makesavenoteenv{tabular}
\RequirePackage[most]{tcolorbox}
\tcbuselibrary{listings}
\usepackage[scaled=0.8]{GoMono}
% \tktl prints the package name held in \tktlname in typewriter type;
% the trailing \xspace decides whether a space must follow it.
\def\tktl{\texttt{\tktlname}\xspace}

% Give "_" the catcode of a letter (11) so that it can appear in the names of
% the private macros below; it is set back to catcode 8 once they are defined.
\catcode`\_11
% \def_active<char>{<code>}
%   #1: a character, given in a form accepted after a backquote (e.g. \< or \&)
%   #2: replacement text
% Makes #1 active (catcode 13) and defines that active character as a \long
% macro expanding to #2.
\long\def\def_active#1#2{%
	\catcode`#1=13
	\begingroup
		% "~" is read here as an active character: with its lowercase code set
		% to #1, \lowercase turns it into the active form of #1. The group only
		% confines the \lccode change; \endgroup is executed before the \def, so
		% the definition is made at the caller's grouping level.
		\lccode`\~`#1
		\lowercase{\endgroup\long\def~{#2}}%
}

%%%%%%%%%%%%% the \tktlcode macro %%%%%%%%%%%%%
% List, in \do<char> form, of the characters handled by \no_ligs.
\def\nolig_list{\do\`\do\<\do\>\do\,\do\'\do\-}
% \do_noligs<char>: make <char> active and define it to print itself through
% \char, preceded by \leavevmode\kern0pt (same \lccode/\lowercase technique as
% \def_active).
\def\do_noligs#1{%
	\catcode`#1\active
	\begingroup
		\lccode`\~`#1\relax
		\lowercase{\endgroup\def~{\leavevmode\kern0pt \char`#1\relax}}%
}
% Run \do_noligs on every character of \nolig_list.
\def\no_ligs{\let\do\do_noligs \nolig_list}
% Boolean read by \tktlcode_i: when true, the characters <, >, & and _ do not
% receive their special active definitions. \specialcode switches it on; it is
% switched off again at the end of each \tktlcode block.
\newif\ifspecialcode
\specialcodefalse
\def\specialcode{\specialcodetrue}
% \tktlcode<delim><code><delim>
% Typesets <code> as an indented typewriter block in which line ends, tabs and
% spaces of the source are obeyed. \string is applied to the token following
% \tktlcode (the delimiter) before \tktlcode_i reads it as its argument.
\def\tktlcode{\expandafter\tktlcode_i\string}
% \tktlcode_i<delim>: set up the layout and catcodes inside a group, then hand
% over to \tktlcode_ii, which reads everything up to the next <delim>.
\def\tktlcode_i#1{%
	\begingroup
		\nobreak\smallbreak
		\parindent0pt
		\parskip=0pt
		\leftskip2.5em
		\rightskip0pt
		\nonfrenchspacing
		\no_ligs
		% give catcode 12 to every character listed in \dospecials
		\def\do##1{\catcode`##1=12\relax}\dospecials
		\ifspecialcode\else
			% normal mode only: "<" opens a group and prints \langle, then
			% switches to italic typewriter; ">" prints \rangle and closes the
			% group; "&" applies \string to the token that follows it; "_"
			% passes the next token to \tktlsubscript
			\def_active\<{\begingroup$\langle$\ttfamily\itshape}%
			\def_active\>{$\rangle$\endgroup}%
			\def_active\&{\string}%
			\def_active\_{\tktlsubscript}%
		\fi
		\ttfamily
		% a tab prints four spaces, an end of line starts a new unindented
		% paragraph, and a space prints \space
		\def_active\^^I{\space\space\space\space}%
		\def_active\^^M{\par\noindent}%
		\def_active\ {\space}%
		% the delimiter itself gets catcode 12 so that it matches the
		% \string-ed token used in the parameter text below
		\catcode`#1=12
	% typeset the code, end the paragraph, close the group opened above and
	% reset the \specialcode flag
	\def\tktlcode_ii##1#1{##1\par\endgroup\specialcodefalse}%
	\tktlcode_ii
}
% Set "_" back to catcode 8 (subscript) now that the private macros are defined.
\catcode`\_8
% Called by the active "_" inside \tktlcode: typesets #1 as a math subscript.
\def\tktlsubscript#1{$_{#1}$}

\makeatletter
% Keep the unmodified \verb available under the name \Verb.
\let\Verb\verb
% Redefine \verb so that, inside its argument, "<" prints \langle and starts
% italics while ">" prints \rangle and ends them. The surrounding group makes
% "<" and ">" active while the definition is read, so that \def< and \def> in
% the body refer to the active characters; \gdef makes the new \verb survive
% the \endgroup.
% NOTE(review): the body appears to be modelled on the LaTeX kernel definition
% of \verb -- confirm against the kernel version in use.
\begingroup
	\catcode`\<13 \catcode`\>13
	\gdef\verb{\relax\ifmmode\hbox\else\leavevmode\null\fi
		\bgroup
			\verb@eol@error \let\do\@makeother \dospecials
			\verbatim@font\@noligs
			% the catcodes are set again at run time, after \@noligs, and the
			% two active characters then receive their bracket definitions
			\catcode`\<13 \catcode`\>13 \def<{\begingroup$\langle$\itshape}\def>{\/$\rangle$\endgroup}%
			\@ifstar\@sverb\@verb}
\endgroup
% \longfrdate@i<a>/<b>/<c>\@nil prints "<c> <month> <a>", where <month> is the
% French name of month number <b> and <c> is passed through \number.
% \longfrdate holds that conversion applied (once, through \edef) to \tktldate.
% NOTE(review): presumably \tktldate has the form yyyy/mm/dd -- confirm in the package.
\def\longfrdate@i#1/#2/#3\@nil{\number#3\relax\space \ifcase#2 \or janvier\or février\or mars\or avril\or mai\or juin\or juillet\or aout\or septembre\or octobre\or novembre\or décembre\fi\space#1}
\edef\longfrdate{\expandafter\longfrdate@i\tktldate\@nil}
% \<text>: prints text in italic typewriter type between \langle and \rangle;
% the argument is delimited by ">".
\def\<#1>{$\langle${\ttfamily\itshape#1\/}$\rangle$}
\makeatother
% doclatex: tcolorbox listing environment used for the examples of this manual.
% The box has a thin frame (boxrule=.5pt), 5pt of padding on every side, and
% the bicolor skin with colback=black!15 and colbacklower=white. The listing
% itself uses the tcblatex style in small typewriter type, with comments in
% grey italics.
\newtcblisting{doclatex}{
	boxrule=.5pt,
	boxsep=0pt,
	top=5pt,
	left=5pt,
	right=5pt,
	bottom=5pt,
	skin=bicolor,
	colbacklower = white,
	colback = black!15,
	listing options={%
		style=tcblatex,
		language={TeX},
		showspaces=false,
		showstringspaces=false,
		commentstyle=\itshape\color{gray!80!white},
		basicstyle=\ttfamily\footnotesize
	}
}

\begin{document}
\parindent=0pt
\thispagestyle{empty}
\begin{titlepage}
	\begingroup
		\centering
		\null\vskip.25\vsize
		{\large\bfseries The package for \TeX{} and \LaTeX\par \Huge \tktlname\par}
		\bigbreak
		v \tktlver
		\smallbreak
		\tktldate
		\vskip1.5cm
		Christian {\scshape Tellechea}\par
		\texttt{unbonpetit@netc.fr}\par
	\endgroup
	\vskip2cm
	\leftskip=.2\linewidth \rightskip=.2\linewidth \small
	This extension for \TeX{} or \LaTeX{} provides tools based on Parsing Expression Grammars (PEGs) for manipulating tokens. Given an arbitrary set of tokens with balanced braces, it is possible to define matching rules to test for matches, count them, perform replacements, capture tokens, and more.
\end{titlepage}
\parindent0pt \pagestyle{plain}
\tableofcontents
\parskip\medskipamount
\newpage

\section{Presentation}
The \tktl extension works with any engine. It is not limited to \LaTeX{}, as the \texttt{pegmatch} package unfortunately is, and to use it, you need to write
\begin{itemize}
	\item \verb|\input tokstools.tex| with (pdf)(Xe)(lua)(op)\TeX\ (the syntax \verb|\input{tokstools}| is also accepted);
	\item \verb|\usepackage{tokstools}| with (pdf)(Xe)(lua)\LaTeX.
\end{itemize}

This package only requires \href{https://ctan.org/pkg/simplekv}{simplekv}, which will be loaded if it hasn't been.

\textit{Important notice}: this English manual is provided so that as many people as possible can access the necessary instructions for using the \tktl package. It is very important for readers of this manual to understand that it is merely a translation generated by various AI-based tools available online. The quality and accuracy of the translation are therefore not guaranteed, especially since the resulting translation has not been reviewed by a human.  Consequently, \textbf{for reliable information, please refer to the French manual}.
%
%
%
%
%
%
%
%
%
%
%
%
%

\subsection{What’s new in version 0.2}
\paragraph{Macro \texttt{\textbackslash pegreplace}}
The \verb|\pegreplace| macro has a new syntax while offering additional features; see page~\pageref{pegreplace}. Unfortunately, this change breaks the syntax from version 0.1. I sincerely apologize to \tktl users for this inconvenience.

\paragraph{Macro \texttt{\textbackslash printtoks}}
The \verb|\printtoks| macro can now, depending on the value of the new \verb|mode| key, redirect token information to the log file; see page~\pageref{printtoks}.

\paragraph{Validity of assignment directives}
From now on, no validity checks are performed on the “assignment directives” passed via the \verb|assign| key and similar mechanisms. The user must therefore make an informed choice between:
\begin{itemize}
	\item \verb|assign={}|, which is an empty directive to send the result to the \TeX{} input stream for typesetting;
	\item \verb|assign=\def<macro>| so that the result is assigned to the \<macro>;
	\item \verb|assign=<token register>| to assign the result to a token register, which is necessary when it contains tokens of catcode 6 (\verb|#|) to denote arguments.
\end{itemize}

\subsection{Tokens, charcode, catcode, engines}
The \tktl package operates on sets of tokens (or token strings). Tokens are the smallest unit of source code manipulated by \TeX.

Users should be aware that the notion of tokens depends on the type of engine used. It is only for 8~bit engines, such as pdf\LaTeX{} for example, that \hbox{1 token${}={}$1 byte}. As an example, the character “\verb|€|” is seen
\begin{itemize}
	\item by pdf\LaTeX{} as 3 tokens whose charcodes are 226, 130 and 172 and whose catcodes are all equal to 13;
	\item by lua\LaTeX{} as 1 single token with charcode 8364 and catcode 12.
\end{itemize}

This difference must be taken into account, and can be the source of more or less understandable errors when manipulating UTF-8 characters on an 8~bit engine.

This manual was compiled with lua\LaTeX, so tokens with charcodes greater than 255 may exist.

Whatever macros are provided by \tktl, they always decompose their token argument, often called \<tokens>, into a list of tokens, each with a charcode and a catcode. The catcodes visible to \tktl are listed in the table below, along with the most common tokens with charcodes less than 255 that have these catcodes by default.
\begin{center}
	\renewcommand\arraystretch{1.15}
	\begin{tabular}{>{\leftskip0ptplus1fill \rightskip\leftskip}p{1.25cm}p{5.5cm}p{6.5cm}}\hline
		Catcode & Description                        & Tokens\\\hline
		1       & beginning of a  group              & \verb|{|\\
		2       & end of a group                     & \verb|}|\\
		3       & math shift                         & \verb|$|\\
		4       & alignment tab                      & \verb|&|\\
		6       & parameter character                & \verb|#|\\
		7       & superscript                        & \verb|^|\\
		8       & subscript                          & \verb|_|\\
		10      & space                              & \Verb*| | and \itshape HT\\
		11      & letters                            & \verb|a-z| and \verb|A-Z|\\
		12      & others                             & digits (\verb|0-9|)\par
		                                               punctuation signs (\verb|: . ; , ? ! ' "|)\par
		                                               math signs (\Verb|+ - * / = < >|)\par
		                                               other signs (\verb/[ ] ( ) | @ `/)\\
		13      & active character                   & \verb|~|\\
		16      & control sequence                   & \verb|\<letters>| or \verb|\<char>| \\
		\hline
	\end{tabular}
\end{center}
None of the macros provided by \tktl are expandable. If you wish the results of these macros, or even captures, to be contained in macros that are purely expandable, you must use the assignment instructions defined in this manual.

\subsection{Notations}
What is referred to as a “marker” in this manual is a control sequence (or character) that acts as a reference point to be recognized within an argument, but which is not modified by \tktl and whose meaning is irrelevant.

Throughout this manual:
\begin{itemize}
	\item \<tokens> denotes a set of tokens\footnote{The primitive \texttt{\textbackslash par} may be found in \<tokens>.} balanced in tokens of catcode 1 and 2 (opening and closing braces) that serves as an argument to macros or markers in charge of processing these tokens;
	\item \<1-pattern> represents a single pattern, i.e., consisting of a marker with its possible argument, preceded by an optional predicate and followed by an optional repetition specification;
	\item \<patterns> represents a single \<1-pattern> or a combination of several \<1-pattern> obtained using the concatenation operator «\verb|:|» and  the ordered choice operator «\verb/|/» (only the “\verb/|/” operator for \verb|\toksdo| and \verb|\tokscount|);
	\item \verb|{<patterns>}| is a group that accepts an optional predicate and repetition directive and behaves like a \<1-pattern>.
\end{itemize}

\subsection{Information about tokens using {\ttfamily\textbackslash printtoks}}\label{printtoks}
The macro
\tktlcode|\printtoks[<keys>=<values>]{<tokens>}|
displays tokens with their charcode and catcode below. If a token is a control sequence or a control character, no charcode is displayed.

\begin{doclatex}
\catcode`\!=6 % “!” is similar to “#”
\printtoks[code=\ttfamily]{\def\foo#1!2#3{\hbox to 1cm{\hss$!1^#2_!3$\hss}}}
\medbreak
\printtoks[code=\ttfamily]{eéė€}
\end{doclatex}

The \<keys> available are:
\begin{center}
	\renewcommand\arraystretch{1.25}
	\begin{tabular}{>\ttfamily p{3.5cm}>\ttfamily p{2.5cm}p{8cm}}\hline
	\rmfamily Key & \rmfamily Default value & Description\\\hline
	expand arg & 0 & number of expansions for the argument containing the \<tokens> before being taken into account\\
	code & \<empty> & code executed before displaying a token\\
	intertoks & 0.33em & horizontal space between each token, inserted by the primitive \verb|\hskip|\\
	printcharcode & true & display or not the charcode\\
	printcatcode & true & display or not the catcode\\
	hexcharcode & false & display the charcode in base 16 if \verb|true| and in base 10 otherwise\\
	baselinecoeff & 0.8 & vertical spacing coefficient\\
	vlines & true & display or not the vertical lines between each token\\
	boxed & true & put everything in a \verb|\hbox| if \verb|true|, which makes everything unbreakable\\
	mode  & 0    & If 1 or 2, redirect token information to the log file\\\hline
\end{tabular}
\end{center}

When the \verb|mode| key is set to 1, information about each token is also written to the log file. When this option is set to 2, the information is written only to the log file, and nothing is displayed in the PDF.

For example, the following code
\specialcode\tktlcode|\def\foo#1{$#1$}
\catcode`\!=6 % “!” is similar to “#”
\printtoks[mode=2]{\def\foo#1!2#3{\hbox to 1cm{\hss$!1^#2_!3$\hss}}}|
generates the following lines in the log file:

\columnseprule0.4pt
\begin{multicols}2
\specialcode\tktlcode|------ Begin token list ------
  \def catcode 16 (primitive)
  \foo catcode 16 (macro:#1->$#1$)
  # catcode 6  (macro parameter charcater)
  1 catcode 12 (character)
  ! catcode 6  (macro parameter charcater)
  2 catcode 12 (character)
  # catcode 6  (macro parameter charcater)
  3 catcode 12 (character)
  { catcode 1  (begin-group character)
  \hbox catcode 16 (primitive)
  t catcode 11 (letter)
  o catcode 11 (letter)
    catcode 10 (blank space)
  1 catcode 12 (character)
  c catcode 11 (letter)
  m catcode 11 (letter)
  { catcode 1  (begin-group character)
  \hss catcode 16 (primitive)
  $ catcode 3  (math shift character)
  ! catcode 6  (macro parameter charcater)
  1 catcode 12 (character)
  ^ catcode 7  (superscript character)
  # catcode 6  (macro parameter charcater)
  2 catcode 12 (character)
  _ catcode 8  (subscript character)
  ! catcode 6  (macro parameter charcater)
  3 catcode 12 (character)
  $ catcode 3  (math shift character)
  \hss catcode 16 (primitive)
  } catcode 2  (end-group character)
  } catcode 2  (end-group character)
------- End token list -------|
\end{multicols}

\section{Act on each token with \texttt{\textbackslash toksdo}}
This macro scans all tokens one by one, independently of each other. The user can specify criteria using \<patterns> and for each, an \<action> to be performed if the token matches \<patterns>.

\subsection{Elementary patterns}
Three elementary patterns that can match a token are available, accessible via the markers \verb|\r|, \verb|\R| and \verb|\S| respectively.

\subsection{The pattern \texttt{\textbackslash r}}\label{motif:r}
The syntax of this \<1-pattern> is
\tktlcode/\r{<csv of token intervals>: <csv of catcode intervals>}/
where an interval is of the form “range extending between two tokens” of type \verb|<token1>-<token2>|, but can be reduced to a single token \verb|<token>|.

The \<csv of catcode intervals> can be replaced by the character “\verb|*|”, which acts as a wildcard and stands for any interval. If the \<csv of catcode intervals> is omitted, it is understood as “\verb|*|”.

Commas, hyphens and “\verb|:|” cannot be specified as tokens, as they are part of the syntax (see the \verb|\R| pattern to get around this limitation).

For example:\par
\begin{tabular}{>\raggedleft p{7cm}p{9cm}}
	\verb|\r{a-z:11}|                  & matches with a token from \verb|a| to \verb|z| with an 11 catcode.\\
	\verb|\r{a-z}| or \verb|\r{a-z:*}| & matches with a token from \verb|a| to \verb|z| with any catcode.\\
	\verb|\r{a,e,i,o,u,y}|             & matches with any vowel with any catcode.\\
	\verb|\r{a-z,A-Z,0-9:11,12}|       & matches any alphanumeric character (lowercase, uppercase or numeric), with either an 11 or 12 catcode.
\end{tabular}

It is important to note that \emph{no extra spaces} are allowed in \<csv> syntax:
\begin{itemize}
	\item \Verb*|\r{ a-z }| causes a compilation error: «\texttt{! Improper alphabetic constant}»; you must write \verb|\r{a-z}|;
	\item \Verb*|\r{1, ,2 }| is also incorrect; you must write \Verb*|\r{1, ,2}|.
\end{itemize}

\subsection{The pattern \texttt{\textbackslash R}}\label{motif:R}
The syntax of this \<1-pattern> is
\tktlcode/\R{<csv of charcode intervals>:<csv of catcode intervals>}/

In both csv lists, the “\verb|*|” character is a wildcard and replaces any interval. The charcodes are numbers written in digits (base 10) or in the manner of \TeX, where “\verb|`\:|”, “\verb|`\,|” and “\verb|`\-|” are the integers corresponding respectively to the charcodes of the “\verb|:|”, “\verb|,|” and “\verb|-|” tokens.

For example:\par
\begin{tabular}{>\raggedleft p{6cm}p{9cm}}
	\verb|\R{*:10}| & matches any token with catcode 10 (which is a space for \TeX)\\
	\verb|\R{*:16}| & matches any token with catcode 16 (which is a control character or control sequence for \TeX)\\
	\verb|\R{106-115: 11}| & matches with any token from “\verb|j|” to “\verb|s|” having a catcode of 11\\
	\verb|\R{`\a-`\z}| & is identical to \verb|\R{97-122}| which is equivalent to \verb|\r{a-z}|\\
	\verb|\R{`\,,`\;,`\: :12}| & is identical to \verb|\R{44,59,58:12}| and matches \verb|,| or \verb|;| or \verb|:| with catcode 12\\
	\verb|\R{*:*}| & matches any token
\end{tabular}

\subsection{The pattern \texttt{\textbackslash S}}\label{motif:S}
The syntax of this \<1-pattern> is
\tktlcode/\S{<tokens>}/

A token matches if it appears in \<tokens> passed as an argument to \verb|\S|.

In the tokens “\verb|ab \foo{0 123}c d|”, those that match the pattern “\verb|\S{1{a }\foo}|” are boxed in red\label{encadrerouge}:
\begin{center}
	\fboxsep=1.5pt
	\toksdo{ \S{1{a }\foo} -> \ifnum\selfcatcode=10 % si espace
	                              \addtok{{\color{red}\fbox{\strut\textvisiblespace}}}%
	                          \else
	                              \setcatcode{12}%
	                              \addtok{{\ttfamily\color{red}\fbox{\strut\self}}}%
	                          \fi,
	          \R{*:*}      -> \addtok{\,{\ttfamily\self}\,}
	       }{ab \foo{0 123}c d}
\end{center}

The \<tokens> may contain one or more nonnested occurrences of the marker \verb|\c|\label{marqueur:c:catcode}. The syntax of this marker is
\tktlcode/\c{<catcode>}{<tokens>}/

In the argument of \verb|\S|, the action of \verb|\c| is to assign the \<catcode> passed in the first argument to the \<tokens>. If the \<catcode> is reduced to a single digit, the curly braces are optional. The same applies to the second argument if it contains a single token.

If a token is a \<macro>, the only catcode it can receive is 12, in which case catcode 12 tokens from \verb|\string<macro>| will be created.

The catcodes accepted in the first argument of \verb|\c| are: 1, 2, 3, 4, 6, 7, 8, 10, 11, 12 and 13. The user should be extremely careful about the consequences of changing catcodes to the sensitive categories numbered 1, 2 and 6.

If we write
\tktlcode/\S{12\c{12}\bar3abc\c{11}{4 5.6}7\c78 9}/
then the \<set of tokens> contained in the \verb|\S| argument is:
\begin{center}
\catcode`\_11
	\tktl_catcode_string{12\c{12}\bar3abc\c{11}{4 5.6}7\c78 9}
	\expandafter\tktl_decode_enctoks\expandafter{\the\tktl_string_toks}{\printtoks[intertoks=0.75em]}
\end{center}

\subsection{Patterns}
The \<patterns> listed for \verb|\toksdo| consist of a single basic \<1-pattern> or multiple patterns separated by «\verb-|-», which means «or»:\par
\begin{tabular}{>\raggedleft p{7cm}p{9cm}}
	\verb/\r{a-z:11} | \R{*:12}/                  & matches any token that is a letter from \verb|a| to \verb|z| with catcode 11 or any token with catcode 12 \\
	\verb/\R{*:16} | \S{0123456789.}/             & matches any token that is a macro, a digit, or a period\\
	\verb/\r{0-9:12} | \r{a-z,A-Z:11} | \R{*:10}/ & matches any token with the default catcode being a digit, a letter, or a space
\end{tabular}

\subsection{Using \texttt{\textbackslash toksdo}}\label{toksdo}
This macro is used as follows:
\tktlcode/\toksdo[<keys>=<values>]
	{
	 <patterns_1> -&> <code_1>,
	 <patterns_2> -&> <code_2>,
	 etc.
	 <patterns_n> -&> <code_n>
	}{<tokens>}/
You can specify as many \<patterns> and associated \<codes> as you wish.

Each \<code> is an arbitrary code\footnote{If this \<code> contains a comma that is not enclosed in curly braces, you must use the syntax \<patterns>\texttt{->\char`\{ }\<code>\texttt{\char`\}}.} that can modify the \<token> that matched \<patterns>. Within this \<code>, the following macros can be used:
\begin{itemize}
	\item \verb|\tokslen|, which is the total number of tokens;
	\item \verb|\selfindex|, \verb|\selfcharcode|, and \verb|\selfcatcode|, which, for the matched token, represent its index (starting at 1 and ending at \verb|\tokslen|), its charcode, and its catcode. If the token is a macro, \verb|\selfcharcode| is \emph{not} a number and is equal to the macro itself;
	\item \verb|\addtok{<code>}| which specifies how to add the matched token to the internal collector that gathers them all for display or assignment at the end of the process;
	\item \verb|\deltok| deletes the matched token;
	\item \verb|\setcharcode{<arithmetic expression>}| and \verb|\setcatcode{<arithmetic expression>}| change the charcode and catcode of the matched token. It is impossible to change the catcode of a macro, and for all other tokens, the catcodes accepted by \verb|\setcatcode| are 1, 2, 3, 4, 6, 7, 8, 10, 11, 12, and 13. Special care must be taken regarding what happens if a catcode is changed to 1, 2, or 6.
\end{itemize}

The \<keys> accepted by the \verb|\toksdo| macro are:
\begin{center}
	\renewcommand\arraystretch{1.25}
	\begin{tabular}{>\ttfamily p{3.5cm}>\ttfamily p{2.5cm}p{8cm}}\hline
		\rmfamily Key & \rmfamily Default value & Description\\\hline
		expand arg    & 0                           & number of expansions for the argument containing the \<tokens> before being processed\\
		collect       & true                        & collects all selected tokens for the purpose of displaying or storing them.
		                                              No collection is performed if this boolean is \verb|false|\\
		assign        & \<empty>                     & assignment instruction: an assignment instruction is any code that may appear before the result enclosed in curly braces and the code that is executed is therefore \verb|<assignment>{<result>}|.
                                                      If \<empty>, displays the result. If the value is a \<macro>, this \<macro> must be a token register that will receive the result.
		                                              The value can also be of the form \verb|\def<macro>|, where the \verb|<macro>| will receive the result\\\hline
	\end{tabular}
\end{center}

Here's how to program Rot13:
\begin{doclatex}
\toksdo{
	\r{a-m,A-M} -> \setcharcode{\selfcharcode + 13},
	\r{n-z,N-Z} -> \setcharcode{\selfcharcode - 13}
	}{Two plus two equals four}
\end{doclatex}

In this example, any digit or hyphen is replaced with “\verb|x|”, and any space is replaced with “\verb|_|” (catcode 12):
\begin{doclatex}
\toksdo{
	\r{0-9} | \S{-} -> \setcharcode{`x},
	\R{*:10}        -> \setcharcode{`\_}\setcatcode{12}
	}{Your password is: \textbf{758-457-384}, don't forget it!}
\end{doclatex}

Keep only the first half of the tokens:
\begin{doclatex}
\toksdo{\R{*:*} -> \ifnum\selfindex>\numexpr\tokslen/2\relax \deltok \fi}{123abcd689}
\end{doclatex}

It is sometimes necessary to assign the result to a token register in case the tokens obtained contain the catcode 6 token (\verb|#| by default):
\begin{doclatex}
\newtoks\myresult
\toksdo[assign=\myresult]
	{\S{2} -> \deltok }% remove all "2"
	{\def\foo#1{02#120}}
\the\myresult% execution of \def\foo#1{0#10}
\foo{A}, \foo{xYz}
\end{doclatex}
If we had written “\verb|assign = \def\mymacro|”, the code that would have been executed behind the scenes is
\tktlcode/\def\mymacro{\def\foo#1{0#10}}/
This code is illegal in \TeX{} (error of the type “\verb|Illegal parameter number in definition of \mymacro|”).

The \verb|collect| keyword, when set to \verb|false|, indicates that tokens should not be collected. This makes sense if the code processing the matched tokens does not modify them. Here's how to count vowels using a counter:
\begin{doclatex}
\newcount\voyel \voyel=0
\toksdo[collect=false]{ \S{aeiouy} -> \advance\voyel 1 }{happy texing}<\the\voyel>
\end{doclatex}

The argument of \verb|\setcharcode| and \verb|\setcatcode| is evaluated using the \verb|\numexpr| primitive; therefore, any arithmetic expression accepted by this primitive is valid. Here, all vowels are capitalized:
\begin{doclatex}
\toksdo{ \S{aeiouy} -> \setcharcode{\selfcharcode + `A - `a} }{Two plus two equals four}
\end{doclatex}

Here's how to extract the tokens located in the highest level of nested braces in two steps:
\begin{doclatex}
\newcount\nestcnt \nestcnt=0 % count level of nesting
\newcount\maxnestcnt \maxnestcnt=0 % is the highest level of nesting
\def\mycode{12{34{5{67}8}}9{{ab}c}{{d{e}f}}g}
\toksdo[expand arg=1,collect=false]{
	\R{*:1} -> \advance\nestcnt1 \ifnum\nestcnt>\maxnestcnt \maxnestcnt=\nestcnt \fi,
	\R{*:2} -> \advance\nestcnt-1
	}{\mycode}Max nesting level = \the\maxnestcnt\par
Most nested tokens:
\toksdo[expand arg=1]{
	\R{*:1} -> \advance\nestcnt1  \deltok,% delete "{"
	\R{*:2} -> \advance\nestcnt-1 \deltok,% delete "}"
	\R{*:*} -> \ifnum\nestcnt<\maxnestcnt \deltok \fi% delete everything except at the highest nesting level
	}{\mycode}
\end{doclatex}

\subsection{Using \texttt{\textbackslash toksdo} and \texttt{\textbackslash addtok}}\label{addtok}
When the \verb|collect| key is set to \verb|true|, tokens are collected. Once they have been processed and optionally modified by \verb|\setcharcode| or \verb|\setcatcode|, each is added as-is to the internal collector, which is called upon at the end of the process to display the tokens or assign them if the \verb|assign| key specifies this.

It is possible to modify how tokens are added to the internal collector via the \verb|\addtok{<code>}| macro, where \verb|<code>| is an arbitrary code in which \verb|\self|\label{marqueur:self} represents the token itself.

By default, and at the beginning of each code block following “\hbox{\verb|<patterns>|\Verb| -> |}”, the \verb|\addtok| macro is initialized to its default value:
\tktlcode|\addtok{\self}|
which means that each token must be added as-is.

If we write \verb|\addtok{\self\self}|, the matched tokens are duplicated. With \verb|\addtok{\fbox{\self}}|, they are framed using the \verb|\fbox| macro from \LaTeX.

The \verb|\deltok| macro is equivalent to \verb|\addtok{}|.

In most cases, \verb|\self| represents a single token: the one currently being processed. It is \emph{only} when a macro is detokenized (assigning 12 to its catcode) that \verb|\self| represents multiple tokens.

In this example, each macro is de-tokenized and boxed, and all spaces are replaced by “\Verb*| _ |”:
\begin{doclatex}
\fboxsep=2pt
\toksdo{ \R{*:16} -> \setcatcode{12}\addtok{\fbox{\self}},% detokenize and \fbox
         \R{*:10} -> \addtok{ \string_ }
       }{a b\foo12. 3\baz- 9}
\end{doclatex}

Each 1 is doubled, and each 0 is replaced with a space:
\begin{doclatex}
\toksdo{ \S{1} -> \addtok{\self\self},
         \S{0} -> \setcatcode{10}% also possible: \addtok{ }
       }{10110101}
\end{doclatex}

The macros \verb|\selfindex|, \verb|\selfcharcode|, and \verb|\selfcatcode| must \emph{not} be used within the argument of \verb|\addtok|, as they will be added as-is to the internal collector. They will only be expanded at runtime, at which point they will contain the data from the last token. If you wish to perform tests on the index, charcode, or catcode of the token to be added, you must do so outside the argument of \verb|\addtok|. Here is the source code, the result of which can be seen on page~\pageref{encadrerouge}:
\begin{doclatex}
In the tokens “\verb|ab \foo{0 123}c d|”, those that match
the pattern “\verb|\S{1{a }\foo}|” are boxed in red:\par
\hfill
\fboxsep=1.5pt
\toksdo{ \S{1{a }\foo} -> \ifnum\selfcatcode=10 % if space
                              \addtok{{\color{red}\fbox{\strut\textvisiblespace}}}%
                          \else% if not a space
                              \setcatcode{12}% detokenize
                              \addtok{{\ttfamily\color{red}\fbox{\strut\self}}}% frame red
                          \fi,
          \R{*:*}      -> \addtok{\,{\ttfamily\self}\,}% for other token add thin space
       }{ab \foo{0 123}c d}
\hfill\null
\end{doclatex}

\section{Counting tokens with {\ttfamily\textbackslash tokscount}}\label{tokscount}
The macro
\tktlcode/\tokscount[<keys>=<values>]{<patterns>}{<tokens>}/
counts how many tokens match the \<patterns> in the \<tokens>. If no patterns are specified (empty argument), the macro counts all tokens.

The available \<keys> are as follows:
\begin{center}
	\renewcommand\arraystretch{1.25}
	\begin{tabular}{>\ttfamily p{3.5cm}>\ttfamily p{2.5cm}p{8cm}}\hline
		\rmfamily Key & \rmfamily Default value & Description \\\hline
		expand arg    & 0                       & number of expansions for the argument containing the \<tokens> before being taken into account\\
		assign        & \<empty>                & assignment directive. If \<empty>, displays the number of tokens that matched. Otherwise, must contain an assignment statement of the type
                                                  \verb|\def<macro>| to assign the result to the \<macro>\\
		assign match  & \<empty>                & assignment directive. If \<empty>, the matched tokens are not collected. Otherwise, must contain an assignment statement
		                                          \verb|\def<macro>| to assign them to a macro or \verb|<macro>| to assign them to a token register\\\hline
	\end{tabular}
\end{center}

The \<patterns> accepted by \verb|\tokscount| are the same as those for \verb|\toksdo|.

\begin{doclatex}
1) \tokscount{ \r{a-z} | \S{10} }{ab1023truc098}\quad
2) \tokscount[assign=\def\cc]{ \R{*:16} }{ab\x123truc\zzz0\yy98}<\cc>\quad% counts macros
3) \tokscount[assign=\def\cc,assign match=\def\xx]{ \S{01} }{1{a01}1{0{b100}}1}<\cc><\xx>\quad% counts 0 and 1
4) \tokscount{ \R{*:12} | \S{\foo\bar\zid}  }{ab\foo c12d*-ef}\quad% counts tokens "letters" or macros \foo\bar\zid
5) \tokscount{}{12{34}5}% counts all tokens
\end{doclatex}

\section{PEG Grammars}
In a set of tokens, some may match patterns that can be organized into a PEG (Parsing Expression Grammar). Whenever a match occurs with a pattern, the tokens that matched are consumed and are no longer available for subsequent patterns (except for predicates; see below). If there is no match, no tokens are consumed.

\subsection{The 5 Pattern Markers}\label{motifs}
The \tktl package provides 5 basic patterns defined by 5 markers for constructing more complex patterns:

\begin{itemize}
	\item the pattern \verb|\r{<csv character ranges>:<csv catcode ranges>}| described on page~\pageref{motif:r};
	\item the pattern \verb|\R{<csv charcode ranges>:<csv catcode ranges>}| described on page~\pageref{motif:R};
	\item the pattern \verb|\S{<tokens>}| described on page~\pageref{motif:S};
	\item the pattern \verb|\s{<tokens>}|\label{motif:s}: any set of tokens consisting exactly of the \<tokens> passed as an argument to \verb|\s| matches this pattern: this is therefore an exact match between strings of tokens.\par Just as with the \verb|\S| marker, it is possible to change the catcodes of certain tokens using the \verb|\c| marker according to the syntax \verb|\c{<catcode>}{<tokens>}|;
	\item the pattern \verb|\.|\label{motif:dot} which matches any token and is equivalent to \verb|\R{*:*}|.
\end{itemize}

\subsection{Repetitions}
Each \<1-pattern> can be followed by a repetition marker if you wish to specify the number of times the match occurs. The repetition markers are:
\begin{itemize}
	\item \verb|^<digit>| or \verb|^{<number>}|: requires that the match be repeated exactly the number of times specified as an argument to \verb|^|;
	\item \verb|^{<min>-<max>}|: specifies that the match must occur between \verb|<min>| and \verb|<max>|. If \verb|<min>| is omitted, it is taken as 0. If \verb|<max>| is omitted, there is no upper limit on the number of repetitions. Both numbers cannot be omitted at the same time.
	\item “\verb|+|”: 1 or more repetitions (equivalent to \verb|^{1-}|);
	\item “\verb|*|”: 0 or more repetitions (equivalent to \verb|^{0-}|);
	\item “\verb|?|”: 0 or 1 repetition (equivalent to \verb|^{0-1}|);
\end{itemize}

If no repetition marker is present after a \<1-pattern>, the directive \verb|^1| is implicitly applied.

It is important to note that \emph{in all cases}, the maximum possible number of repetitions is consumed; this behavior is always “greedy”.

\subsection{Predicates}
Each \<1-pattern> can be preceded by the character “\verb|!|” or “\verb|&|”, which turns it into a predicate:
\begin{itemize}
	\item The predicate \verb|!<1-pattern>| matches if there is no match with the \<1-pattern>;
	\item The predicate \verb|&<1-pattern>| matches if there is a match with the \<1-pattern>.
\end{itemize}

The rule in PEG grammars is that \emph{a predicate never consumes tokens}: a predicate can therefore be understood as a lookahead, i.e.\ a test on what follows without changing the position.

\subsection{Pattern concatenation}
The character “\verb|:|” between two \<1-pattern> indicates that the first \emph{and} the second should match. Of course, we are not limited to two patterns; there can be as many as desired.

\begin{tabular}{>\raggedleft p{6cm}p{9cm}}
	\verb|&\r{0-9}^{3-} : \r{0-9}^2|      & if there are at least 3 digits, matches the first 2\\
	\verb|\r{a-z}+ : \R{*:10}? : \S{01}^2| & matches one or more letters followed by an optional space followed by 2 binary digits\\
	\verb|\r{1-9}+ : \s{00} : !\.|        & matches a number consisting of at least 1 non-zero digit followed by “\verb|00|”, which must be
                                            the \emph{last tokens}, because the predicate “\verb|!\.|” means “must not be followed by a token”
\end{tabular}

\subsection{Choosing between patterns}
The character “\verb/|/” between two \<1-patterns> indicates that a match should be made with either the first \emph{or} the second. Obviously, we are not limited to two patterns; there can be as many as desired. If a match occurs for one of the patterns, it is selected, and none of the subsequent patterns are tested: this is therefore an ordered choice. If we write
\tktlcode/\r{0-9}^5 | \S{01}^2/
the second test will never be performed since it is included in the first. It is therefore important to start with the most specific tests and end with the most general ones if the ranges of the patterns overlap or, worse, are nested.

The concatenation operator “\verb|:|” takes precedence over the selection operator “\verb/|/”, so
\tktlcode/<a> : <b> | <c> | <d> : <e>/
is interpreted as \verb|<a> : <b>| or \verb|<c>| or \verb|<d>:<e>|.

\subsection{Pattern grouping}
Any \<1-pattern> or combination of \<1-patterns>, denoted by \<patterns>, can be enclosed in curly braces to form a new \<1-pattern>, which can, in turn, be preceded by a predicate and followed by a repetition specification according to the syntax described above:
\tktlcode/<predicate>{<pattern combination>}<repetitions>/

\subsection{Spaces}
In pattern syntax, spaces are ignored.

Thus
\tktlcode/\r{0-9} ^ 2 : & \r{a-z}^3 : \r{a-z}^ 2/
is equivalent to
\tktlcode/\r{0-9}^2:&\r{a-z}^3:\r{a-z}^2/

\subsection{Precedences}
The precedences of operators on patterns are (the higher the number, the higher the precedence):
\begin{center}
	\renewcommand\arraystretch{1.25}
	\begin{tabular}{>\ttfamily p{3.5cm}>\ttfamily p{1.5cm}p{8.5cm}}\hline
		Operator                     & \rmfamily Precedence & Description\\\hline
		\verb|{<patterns>}|          & 5                    & pattern grouping\\
		\verb|<1-pattern>?|            & 4                    & matches 0 or 1 time\\
		\verb|<1-pattern>*|            & 4                    & matches 0 or more times\\
		\verb|<1-pattern>+|            & 4                    & matches 1 or more times\\
		\verb|<1-pattern>^{n}|         & 4                    & matches \verb|n| times\\
		\verb|<1-pattern>^{a-b}|       & 4                    & matches between \verb|a| and \verb|b| times\\
		\verb/!<1-pattern>/            & 3                    & matches if \<1-pattern> does not match, without consuming tokens\\
		\verb/&<1-pattern>/            & 3                    & matches if \<1-pattern> matches, without consuming tokens\\
		\<1-pattern$_1$>\verb/:/\<1-pattern$_2$> & 2                    & matches \<1-pattern$_1$> then \<1-pattern$_2$>\\
		\<1-pattern$_1$>\verb/|/\<1-pattern$_2$> & 1                    & matches \<1-pattern$_1$> or \<1-pattern$_2$> (ordered choice)\\\hline
	\end{tabular}
\end{center}

\subsection{Pattern names}
Any pattern can be defined and named so that it can be reused later using a simpler, easier-to-remember syntax\label{defpattern}:
\tktlcode|\defpattern<pattern name>{<patterns>}|
The \<pattern name> \emph{must} be a control sequence. This control sequence is not defined or redefined by \verb|\defpattern|; it is also a marker whose meaning is irrelevant.

\section{Testing for a match with {\ttfamily\textbackslash ifpegmatch}}\label{ifpegmatch}
\subsection{Syntax}
The \verb|\ifpegmatch| macro has the following syntax
\tktlcode/\ifpegmatch[<keys>=<values>]{<patterns>}{<tokens>}{<code if match>}{<code if no match>}/

When all values are set to their defaults, this macro tests whether the \<patterns> match tokens found at the beginning of the \<tokens>.

The available \<keys> are:
\begin{center}
	\renewcommand\arraystretch{1.25}
	\begin{tabular}{>\ttfamily p{3.5cm}>\ttfamily p{3cm}p{8cm}}\hline
		\rmfamily Key    & \rmfamily Default value & Description\\\hline
		expand arg       & 0                           & number of expansions the argument containing the \<tokens> must undergo before being considered \\
		mode             & 1                           & match search mode                                                                   \\
		capture name     & \<empty>                     & capture names                                                                                       \\
		assign prematch  & \verb|\def\prematchtoks|    & assignment rule for tokens preceding those that matched                                   \\
		assign match     & \verb|\def\matchtoks|       & assignment rule for tokens that have matched                                                    \\
		assign postmatch & \verb|\def\remaintoks|      & assignment rule for tokens following those that have matched                                   \\\hline
	\end{tabular}
\end{center}

In this example, we check whether the \<tokens> begin with two digits, an optional space, and at least one lowercase letter
\begin{doclatex}
\defpattern\okmatch{ \r{0-9:12}^2 : \R{*:10}? : \r{a-z:11}+ }
1) \ifpegmatch{\okmatch}{73 ab:*ij}{T}{F}\quad
2) \ifpegmatch{\okmatch}{45foobar2000}{T}{F}\quad
3) \ifpegmatch{\okmatch}{854tex 8}{T}{F}\quad
4) \ifpegmatch{\okmatch}{1 2 b3c}{T}{F}\quad
\end{doclatex}

Let's build a simple grammar to check whether an argument begins with an arithmetic operation of the form \tktlcode/<relative integer><operation><positive integer>/
\begin{doclatex}
\defpattern\sp{ \R{*:10} } % space
\defpattern\digit{ \r{0-9} }% digit
\defpattern\posint{ \digit+ }% positive integer
\defpattern\int{ \S{+-}? : \posint }% +- integer
\defpattern\op{ \S{+-*/} }% math operation
\defpattern\okmatch{ \sp* : \int : \sp* : \op : \sp* : \posint : \sp* }
1) \ifpegmatch\okmatch{2*3}{T}{F}\quad
2) \ifpegmatch\okmatch{-7 + 1 }{T}{F}\quad
3) \ifpegmatch\okmatch{ a + 9 }{T}{F}\quad
4) \ifpegmatch\okmatch{ -2 / 6 + 3 }{T}{F}\quad
5) \ifpegmatch\okmatch{ +2026- 4068}{T}{F}\quad
6) \ifpegmatch\okmatch{ -3 }{T}{F}\quad
7) \ifpegmatch\okmatch{2a-3b}{T}{F}\par
The primitive \verb|\int| is not redefined: $\int x^2dx$
\end{doclatex}

With the predicate “\verb|: !\.|” added to the end of \verb|\okmatch|, line 4 would have displayed “F” because “\Verb*|+ 3 |” remains after the tokens that matched.

\subsection{Mode}
The \verb|\ifpegmatch| macro can search for matches in several modes, specified via the “\verb|mode|” key:
\begin{itemize}
	\item By default, \verb|mode=1| indicates that the match must occur at the beginning of the \<tokens>;
	\item \verb|mode=0| indicates that all \<tokens> must match; this is the strictest mode;
	\item \verb|mode=2| indicates that the match can occur anywhere within the \<tokens>; this is the least restrictive mode.
\end{itemize}

\subsection{Token Output}
The \verb|\ifpegmatch| macro also returns some information:
\begin{itemize}
	\item by default, the \verb|\matchtoks| macro contains the tokens that matched and is empty if no match was found;
	\item by default, the \verb|\remaintoks| macro contains the tokens remaining after those that matched;
	\item by default, the \verb|\prematchtoks| macro contains the tokens preceding those that matched (can only contain tokens when \verb|mode=2|);
	\item the \verb|\matchposition| macro contains the position of the first token that matched and 0 if no match occurred.
\end{itemize}

\begin{doclatex}
1) \ifpegmatch[mode=0]{ \r{A-Z}^3 }{1ABC6}{T}{F}, match=<\matchtoks>, remain=<\remaintoks>, pos=\matchposition\par
2) \ifpegmatch[mode=1]{ \r{A-Z}^3 }{1ABC6}{T}{F}, match=<\matchtoks>, remain=<\remaintoks>, pos=\matchposition\par
3) \ifpegmatch[mode=2]{ \r{A-Z}^3 }{1ABC6}{T}{F}, match=<\matchtoks>, remain=<\remaintoks>, pos=\matchposition\par
4) \ifpegmatch[mode=0]{ \r{A-Z}^3 }{ZZZ12}{T}{F}, match=<\matchtoks>, remain=<\remaintoks>, pos=\matchposition\par
5) \ifpegmatch[mode=1]{ \r{A-Z}^3 }{ZZZ12}{T}{F}, match=<\matchtoks>, remain=<\remaintoks>, pos=\matchposition\par
6) \ifpegmatch[mode=2]{ \r{A-Z}^3 }{ZZZ12}{T}{F}, match=<\matchtoks>, remain=<\remaintoks>, pos=\matchposition
\end{doclatex}

\subsection{Captures}\label{marqueur:c:capture}
The \verb|\c| token, when placed before a \<1-pattern>, indicates that tokens matching this \<1-pattern> must be captured along with their positions. If a \<1-pattern> is a predicate, no capture is performed. Captures are sorted in the chronological order in which they were made and are returned, in two forms, by \verb|\tokscapture{<index>}| or by \verb|\tokscapture{<name>:<index>}|. The \verb|<name>| is optional and is set with the \verb|name| key, and the \verb|<index>| is the capture’s sequence number.

If \verb|<index>| is 0, all captures are enclosed in curly braces and returned as a comma-separated list in the following format:
\tktlcode/{<capture_1>},{<capture_2>},...,{<capture_n>}/

Similarly, the macro \verb|\poscapture{<index>}| or \verb|\poscapture{<name>:<index>}| returns the capture positions, with the difference that if \<index> is 0, the positions are returned as a comma-separated list \emph{without being enclosed in curly braces}:
\tktlcode/<position_1>,<position_2>,...,<position_n>/

If an \<index> exceeds the maximum index, an error message is issued.

If \verb|\c| appears after a \<1-pattern>, only the position is captured.

In this example, two complete captures (tokens+position) and one position capture are performed:
\begin{doclatex}
\ifpegmatch[mode=2]{ \c\r{a-z}+ : \c\r{0-9}^2\c }{12abc666def}{T}{F}\par
toks : <\detokenize\expandafter\expandafter\expandafter{\tokscapture{0}}>\qquad
1=<\tokscapture{1}>, 2=<\tokscapture{2}>\par

pos : <\poscapture{0}>\qquad
1=<\poscapture{1}>, 2=<\poscapture{2}>, 3=<\poscapture{3}>
\end{doclatex}

In this example, we define a grammar that matches a scientific notation of the form $a\times10^{b}$ and captures both numbers $a$ and $b$:
\begin{doclatex}
\defpattern\sp{ \R{*:10} }
\defpattern\sign{ \S{+-} }
\defpattern\digit{ \r{0-9} }
\defpattern\integer{ \digit+ }
\defpattern\decsep{ \S{.,} }
\defpattern\scidec{ \sign? : \r{1-9} : {\decsep : \digit+}?  }
\defpattern\opbr{ \R{*:1} }
\defpattern\clbr{ \R{*:2} }
\defpattern\^{ \R{*:7} }
\defpattern\exponent{ \opbr : \sp? : \c{\sign? : \sp? : \integer} : \sp? : \clbr | \c\digit }
\defpattern\sci{\c\scidec : \sp? : \s{\times10} : \sp? : \^ : \sp? :\exponent}
1) \ifpegmatch\sci{3\times10^5}             {<\tokscapture{1}> <\tokscapture{2}>}{Faux}\par
2) \ifpegmatch\sci{-2.25\times10^{-3}}      {<\tokscapture{1}> <\tokscapture{2}>}{Faux}\par
3) \ifpegmatch\sci{-0.75\times10^7 }        {<\tokscapture{1}> <\tokscapture{2}>}{Faux}\par
4) \ifpegmatch\sci{15\times10^0}            {<\tokscapture{1}> <\tokscapture{2}>}{Faux}\par
5) \ifpegmatch\sci{1.5\times10^ 1 }         {<\tokscapture{1}> <\tokscapture{2}>}{Faux}\par
6) \ifpegmatch\sci{-2.75 \times 10 ^ { 11 }}{<\tokscapture{1}> <\tokscapture{2}>}{Faux}\par
7) \ifpegmatch\sci{-9.96\times10^{ -2 }}    {<\tokscapture{1}> <\tokscapture{2}>}{Faux}\par
8) \ifpegmatch\sci{-0\times10 ^0}           {<\tokscapture{1}> <\tokscapture{2}>}{Faux}\par
9) \ifpegmatch\sci{-1\times10^ {- 7 }}      {<\tokscapture{1}> <\tokscapture{2}>}{Faux}
\end{doclatex}

\subsection{Recursive Grammars}
The \tktl package supports recursive grammars, but no optimization is performed, and the way they are handled remains naive. Consequently, certain recursive grammars—particularly those involving “left recursion”—will result in infinite loops.

That said, it is possible to use recursive grammars, but adding captures tends to be somewhat unpredictable because the order of these captures is not obvious and depends on the very definition of the grammar and thus on the resulting tree traversal.

Here is a recursive grammar capturing an arithmetic expression involving the four operations with parentheses:
\begin{doclatex}
\defpattern\num{ \r{0-9}+ }
\defpattern\term{ \num | \s{(} : \expr : \s{)} }
\defpattern\factor{ \term : {\S{*/} : \term }* }
\defpattern\expr{ \factor : {\S{+-} : \factor }* }
1) \ifpegmatch[mode=0]\expr{1+3}{T}{F}\quad
2) \ifpegmatch[mode=0]\expr{1-2*3}{T}{F}\quad
3) \ifpegmatch[mode=0]\expr{3*4+6}{T}{F}\quad
4) \ifpegmatch[mode=0]\expr{3-(1-2*3)}{T}{F}\quad
5) \ifpegmatch[mode=0]\expr{3*4*(1-3*2-(1-3/7)*3)/(1/7+2*3)*3-5}{T}{F}\quad
6) \ifpegmatch[mode=0]\expr{6-9*(2-3)+4/5}{T}{F}
\end{doclatex}

This grammar ensures that the expression begins with a parenthesis and contains balanced parentheses:
\begin{doclatex}
\defpattern\nobrtext{ { !\S{()} : \. }+ }
\defpattern\inparen{ \nobrtext : \inparen* | \s{(} : \inparen* : \s{)} }
\defpattern\expr{ &\s{(} : \inparen : !\. }% predicates -> must start with '(' and end with ')'
1) \ifpegmatch\expr{a(b)c}{T}{F}\quad
2) \ifpegmatch\expr{(a(abc)()d)}{T}{F}\quad
3) \ifpegmatch\expr{(a(bc))df)}{T}{F}\quad
4) \ifpegmatch\expr{((abc)d((e)f)g)}{T}{F}\quad
5) \ifpegmatch\expr{((foo)b(((b)a)r)}{T}{F}
\end{doclatex}

\section{Counting matches with {\ttfamily\textbackslash pegcount}}\label{pegcount}
The macro 
\tktlcode/\pegcount[<keys>=<values>]{<patterns>}{<tokens>}/
counts how many times \<patterns> match in the \<tokens>. Each position is saved, and each match is captured so it can be easily retrieved.

Captures explicitly requested with \verb|\c| are not allowed here: any such request is ignored.

The available \<keys> are:
\begin{center}
	\renewcommand\arraystretch{1.25}
	\begin{tabular}{>\ttfamily p{3.5cm}>\ttfamily p{2.5cm}p{8cm}}\hline
		\rmfamily Key    & \rmfamily Default value & Description\\\hline
		expand arg       & 0                          & number of expansions the argument containing the \<tokens> must undergo before being taken into account.\\
		assign           & \<empty>                   & if empty, displays the number of matches. Otherwise, must contain an assignment statement of the type
                                                        \verb|\def<macro>| to assign the result to the \<macro>\\
		assign positions & \verb|\def\matchposlist|   & assignment statement for the list of positions. If \<empty>, no capture is performed. Otherwise, must
                                                        contain an assignment statement \verb|\def<macro>| to assign the list of positions to a \<macro>\\
		name             & \<empty>                   & is the name of the captures returned by \verb|\tokscapture|\\\hline
	\end{tabular}
\end{center}

Captures are returned, in two expansions, by \verb|\tokscapture{[<name>:]<index>}|. The \verb|<name>| is optional and is set using the \verb|name| key, and the \verb|<index>| is the capture index. If \verb|<index>| is 0, all captures are enclosed in curly braces and returned as a comma-separated list in the following format:
\tktlcode/{<capture_1>},{<capture_2>},...,{<capture_n>}/

\begin{doclatex}
1) \pegcount{ \r{0-9}+ : \r{a-z}+ }{foo25bar}, <\matchposlist>\par% numbers followed by letters
2) \pegcount{ \r{0-9}+ : \r{a-z}+ }{a12bcd,4b,z875bar}, <\matchposlist>, 
"\tokscapture{0}", 1="\tokscapture{1}", 2="\tokscapture{2}", 3="\tokscapture{3}"\par% numbers followed by letters
3) \pegcount{ \s{+} : {\r{a-c}^2 : \r{0-9}}+ : \s{+} }{+ab3+..+bb6ab8ca7+..+aa1bb2+..},
	<\matchposlist>,
	1="\tokscapture{1}", 2="\tokscapture{2}", 3="\tokscapture{3}"
\end{doclatex}

\section{Replacements with {\ttfamily\textbackslash pegreplace}}\label{pegreplace}
The macro
\tktlcode/\pegreplace[<keys>=<values>]
      {
      <patterns_1> -&> <replacement_1>,
      <patterns_2> -&> <replacement_2>,
      etc.
      <patterns_n> -&> <replacement_n>
      }{<tokens>}/
searches for all \<patterns> within the \<tokens> and, for each match, replaces the tokens that matched with the code defined in \<replacement>.

The available \<keys> are:
\begin{center}
	\renewcommand\arraystretch{1.25}
	\begin{tabular}{>\ttfamily p{3.5cm}>\ttfamily p{2.5cm}p{8cm}}\hline
		\rmfamily Key & \rmfamily Default value & Description\\\hline
		expand arg    & 0                       & number of expansions the argument containing the \<tokens> must undergo before being taken into account.\\
		mode          & 2                           & specifies the mode in which \verb|\pegreplace| should search for matches\\
		assign        & \<empty>                & if empty, displays the \<tokens> obtained after performing the replacements. Otherwise, must contain an
                                                  assignment statement to assign the result to a macro using \verb|\def<macro>| or to a token register.\\\hline
	\end{tabular}
\end{center}

It is important to understand that all positions in \<tokens> are considered, from position $1$ to position $L$, where $L$ is the number of tokens. For each position, all \<patterns> are tested in turn to determine whether a match occurs at that position. If a \<pattern$_i$> matches at position $k$, the corresponding \<replacement$_i$> is performed, and \verb|\pegreplace| is ready to move to position $k+1$. The key \verb|mode| must be an integer between 0 and 2 that specifies how the \verb|\pegreplace| macro should behave after the first match has been found and the first replacement made:
\begin{itemize}
	\item \verb|mode=0| indicates that after the first match at position $k$, no other positions will be examined, no further actions will be performed, and the \verb|\pegreplace| macro has finished its work;
	\item \verb|mode=1| indicates that all subsequent positions will be examined, but if a \<pattern$_k$> matches, it is then neutralized and cannot match again; in other words, each \<pattern> can match at most once;
	\item \verb|mode=2|, which is the default value, indicates that all subsequent positions will be examined and that each \<pattern> can match as many times as necessary.
\end{itemize}

The \<replacement> is arbitrary code that does not contain a comma; it may include “\verb|\0|”\label{marqueur:0}, which stands for “the tokens that matched”; “\verb|\1|” is the first capture made by \verb|\c|, “\verb|\2|” is the second, and so on up to “\verb|\9|”. Note that spaces before and after \<replacement> are removed. If a \<replacement> is likely to contain a comma or if you want to preserve the spaces before or after it, you must enclose the entire expression in curly braces.

For example, to convert each word into a CSV list and add spaces, you would need to write
\begin{doclatex}
\pegreplace{
	\r{a-z,A-Z} : &\. : !\R{*:10} -> {\0, } ,% braces necessary here
	\R{*:10}                      -> \quad
	}{Happy TeXing}
\end{doclatex}

To show how \verb|\pegreplace| behaves depending on the value of the \verb|mode| key, in this example we frame any sequence of two lowercase letters and place any digit from 1 to 5 between angle brackets:
\begin{doclatex}
\fboxsep=1pt
\pegreplace[mode=0]{
	\r{a-z}^2 -> \fbox{\strut\0},
	\S{12345} -> $\langle\0\rangle$
	}{6foob1baz327z}

\pegreplace[mode=1]{
	\r{a-z}^2 -> \fbox{\strut\0},
	\S{12345} -> $\langle\0\rangle$
	}{6foob1baz327z}

\pegreplace[mode=2]{
	\r{a-z}^2 -> \fbox{\strut\0},
	\S{12345} -> $\langle\0\rangle$
	}{6foob1baz327z}
\end{doclatex}

Framing of \verb|\alpha| and \verb|\beta| with their coefficients in math mode:
\begin{doclatex}
\fboxsep=1pt
\pegreplace{ \r{1-9}* : \S{\alpha\beta} -> \fbox{\strut$\0$} }{$2\alpha-3\beta=-4-\alpha+4\beta$}
\end{doclatex}

In this example, we create a grammar that matches the French standards for postal codes and city names (consisting of words and hyphens).
\begin{doclatex}
\defpattern\sp{ \R{*:10} }
\defpattern\CP{\c\r{0-9}^2 : \sp? : \c\r{0-9}^3 }% \1=2 first digits  \2=3 last digits
\defpattern\upcase{ \r{A-Z,À,É} }
\defpattern\lowcase{ \r{a-z,é,è,à,ê,ô,ç} }
\defpattern\ville{ \upcase : \lowcase+ : { \S{-} : {\upcase | \lowcase} : \lowcase+ }* }
\defpattern\CPville{ \CP : \sp : \c\ville }% capture CP and capture name
1) \pegreplace{\CPville -> CodePostal=\textbf{\1\2} est \fbox{\3} }{Destination 75000 Paris}\par
2) \pegreplace{\CPville -> CodePostal=\textbf{\1\2} est \fbox{\3} }{Destination 64 500 Saint-Jean-de-Luz suite}\par
3) \pegreplace{\CPville -> CodePostal=\textbf{\1\2} est \fbox{\3} }{Destination 38120 Saint-Égrève}
\end{doclatex}

Let's come up with a little riddle involving two-digit numbers that are not zero:
\begin{doclatex}
\defpattern\num{ \c\r{1-9} : \c\r{1-9} : !\r{0-9} }
If \pegreplace{\num -> \1\2 gives \the\numexpr\1*\2+\2\relax }{27, 34 and 43}, what gives 57 ? (Answer in 7.5 million years)
\end{doclatex}

\section{Errors}
Among the many errors that can occur, here are a few...

\subsection{Unbalanced Braces}
If a result or capture consists of tokens with unbalanced braces, a compilation error will occur.

For example, if you use \verb|\toksdo| to remove the closing braces:
\tktlcode/\toksdo{ \R{*:2} -&> \deltok }{foo{\bfseries123}bar}/
the error reported by \tktl is “\texttt{! Unbalanced open-group token, 1 close-group token added}”.

To balance the braces, a closing brace is therefore added to the end of the tokens passed as arguments, and the tokens “\verb|foo{\bfseries123bar}|” are sent to the display.

Similarly, if we remove the opening braces:
\tktlcode/\toksdo{ \R{*:1} -&> \deltok}{foo{\bfseries123}bar}/
the error message is “\texttt{! Unbalanced close-group token ignored}”.

The extra closing brace is therefore ignored, and the tokens “\verb|foo\bfseries123bar|” are sent to the display.

\subsection{Engine not suitable for encoding}
Using an 8-bit engine \emph{requires} that every character used be a single token, particularly in the argument of the \verb|\r| marker.

Compiling this code with an 8-bit engine
\tktlcode/\ifpegmatch{ \r{a,e,i,o,u,y,é,à,ê,ù} }{Un été à l'océan}{T}{F}/
causes a compilation error because “\verb|é|” consists of 2 tokens (charcodes 195 and 169, catcodes equal to 13), whereas the syntax of \verb|\r| requires that there be only one. The compilation error reported by \tktl is “\texttt{! Multiple token 'é', '0' inserted}”.

The correct syntax would be
\tktlcode/\ifpegmatch{ \r{a,e,i,o,u,y} | \S{éàêù} }{Un été à l'océan}{T}{F}/

\subsection{Invalid Interval}
Intervals of the form \verb|<a>-<b>|, where \verb|a| and \verb|b| are either tokens or numbers, \emph{must} be in the correct order.

Since the token “ń” comes \emph{after} “ó” in UTF-8, the code
\tktlcode/\ifpegmatch{ \r{ń-ó} }{Some text}{T}{F}/
causes the compilation error “\texttt{! Unsorted interval in 'ń-ó', 'ó-ń' inserted}”. The range is corrected by \tktl.

\subsection{Invalid Pattern Syntax}
Spaces are ignored, but any pattern syntax that does not conform to what is described starting on page~\pageref{patterns} will cause a compilation error.

For example
\tktlcode/\pegcount{ \r{a-z} | S{10} }{ab1023truc098}/
causes the error “\texttt{! Found "S" when expecting \string\r, \string\R, \string\s, \string\S\space or \string\.}”.

\subsection{Capture Errors}
If a \<name> has not been defined or if an \<index> is requested outside of those assigned during captures, a compilation error is raised.

\tktlcode/\pegcount{ \r{a-z}^2 }{ab1023truc098}/
creates 3 captures, so requesting
\tktlcode/\tokscapture{4}/
results in the compilation error “\texttt{! Undefined token capture at index "4"}”.

\section{List of macros and markers}
\subsection{List of commands}
Here are the macros available to the user:
\begin{itemize}
	\item \verb|\printtoks|, see page~\pageref{printtoks};
	\item \verb|\toksdo|, \verb|\setcharcode|, \verb|\setcatcode|, \verb|\deltok|, \verb|\addtok|, \verb|\selfcharcode|, \verb|\selfcatcode| and \verb|\selfindex|, see pages~\pageref{toksdo} and following;
	\item \verb|\tokscount|, see page~\pageref{tokscount};
	\item \verb|\defpattern|, see page~\pageref{defpattern};
	\item \verb|\ifpegmatch|, see page~\pageref{ifpegmatch};
	\item \verb|\pegcount|, see page~\pageref{pegcount};
	\item \verb|\pegreplace|, see page~\pageref{pegreplace}.
\end{itemize}
The following macros are modified by the \verb|\toksdo| macro, but are restored to their previous state once \verb|\toksdo| has finished executing:
\verb|\setcharcode|, \verb|\setcatcode|, \verb|\deltok|, \verb|\addtok|, \verb|\selfcharcode|, \verb|\selfcatcode| and \verb|\selfindex|.

\subsection{List of markers}
\begin{itemize}[topsep=0pt]
	\item\verb|\r{<csv car>:<csv catcode>}|, see page~\pageref{motif:r};
	\item\verb|\R{<csv charcodes>:<csv catcode>}|, see page~\pageref{motif:R};
	\item\verb|\s{<tokens>}|, see page~\pageref{motif:s};
	\item\verb|\S{<tokens>}|, see page~\pageref{motif:S};
	\item\verb|\.|, see page~\pageref{motif:dot};
	\item\verb|\c| has two syntaxes
	\begin{itemize}[label={}]
		\item \verb|\c{<catcode>}{<tokens>}| when used in the argument of \verb|\s| or \verb|\S|, see page~\pageref{marqueur:c:catcode};
		\item \verb|\c| when placed before or after a \<1-pattern>, see page~\pageref{marqueur:c:capture};
	\end{itemize}
	\item\verb|\self|, see page~\pageref{marqueur:self};
	\item \verb|\0|, \verb|\1| up to \verb|\9|, see page~\pageref{marqueur:0};
	\item any \<macro>, whether it exists or not, passed as the first argument to \verb|\defpattern|.
\end{itemize}

\begin{center}
\parskip0pt
$\star$\par
$\star\quad\star$
\end{center}

This package, version \tktlver{}, is still in the experimental stage, and it is possible that, despite the extensive testing that has been done, it contains many bugs.

In addition, some features or syntax are still likely to be slightly modified.

Anyway, I hope \tktl is useful to you. Feel free to contact me by \href{mailto:unbonpetit@netc.fr}{\texttt{\textbf{email}}} to report any bugs, malfunctions, or suggestions for realistic features. Above all, please do not waste your time posting them on \verb|https://tex.stackexchange.com| or any other site, as there is a very good chance I won't see them.
\end{document}