<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
   "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">

<book id="alex">

  <bookinfo>
    <date>2003-8-11</date>
    <title>Alex User Guide</title>
    <author>
      <firstname>Chris</firstname>
      <surname>Dornan</surname>
    </author>
    <author>
      <firstname>Isaac</firstname>
      <surname>Jones</surname>
    </author>
    <author>
      <firstname>Simon</firstname>
      <surname>Marlow</surname>
    </author>
    <address><email>ijones@syntaxpolice.org</email></address>
    <!--     <copyright> -->
    <!--       <year>1997-2001</year> -->
    <!--       <holder>Simon Marlow</holder> -->
    <!--     </copyright> -->

    <abstract>
      <para>Alex is a tool for generating lexical analysers in
      Haskell, given a description of the tokens to be recognised in
      the form of regular expressions.  It is similar to the tool
      <quote>lex</quote> or <quote>flex</quote> for C/C++.</para>
    </abstract>
  </bookinfo>

  <!-- Table of contents -->
  <toc></toc>

  <chapter id="about">
    <title>About Alex</title>

    <para>Alex can always be obtained from its <ulink
    url="http://www.haskell.org/alex">home page</ulink>.  The latest
    source code lives in the <literal>fptools</literal> CVS
    repository; instructions on accessing that repository are <ulink
    url="http://www.haskell.org/ghc/docs/latest/html/building/sec-cvs.html">here</ulink>.</para>

    <section id="relnotes-22">
      <title>Release Notes for version 2.2</title>

      <itemizedlist>
        <listitem>
          <para>Cabal 1.2 is now required.</para>
        </listitem>

        <listitem>
          <para>ByteString wrappers: use Alex to lex ByteStrings
          directly.</para>
        </listitem>
      </itemizedlist>

    </section>
    <section id="relnotes-210">

      <title>Release Notes for version 2.1.0</title>

      <itemizedlist>
        <listitem>
          <para>Switch to a Cabal build system: you need a recent
          version of Cabal (1.1.6 or later).  If you have GHC 6.4.2,
          then you need to upgrade Cabal before building Alex.  GHC
          6.6 is fine.</para>
        </listitem>

        <listitem>
          <para>Slight change in the error semantics: the input
          returned on error is before the erroneous character was
          read, not after.  This helps to give better error
          messages.</para>
        </listitem>
      </itemizedlist>
    </section>

    <section id="relnotes-20">
      <title>Release Notes for version 2.0</title>
      
      <para>Alex has changed a <emphasis>lot</emphasis> between
      versions 1.x and 2.0.  The following is supposed to be an
      exhaustive list of the changes:</para>

      <section id="changes-syntax">
	<title>Syntax changes</title>

	<itemizedlist>
	  <listitem>
	    <para>Code blocks are now surrounded by
	    <literal>{...}</literal> rather than
	    <literal>%{...%}</literal>.</para>
	  </listitem>

	  <listitem>
	    <para>Character-set macros now begin with
            &lsquo;<literal>$</literal>&rsquo; instead of
            &lsquo;<literal>^</literal>&rsquo; and have
            multi-character names.</para>
	  </listitem>

	  <listitem>
	    <para>Regular expression macros now begin with
            &lsquo;<literal>@</literal>&rsquo; instead of
            &lsquo;<literal>%</literal>&rsquo; and have
            multi-character names.</para>
	  </listitem>

	  <listitem>
	    <para>Macro definitions are no longer surrounded by
	    <literal>{ ... }</literal>.</para>
	  </listitem>

	  <listitem>
	    <para>Rules are now of the form
<programlisting>&lt;c1,c2,...>  regex   { code }</programlisting>
           where <literal>c1</literal>, <literal>c2</literal> are
           startcodes, and <literal>code</literal> is an arbitrary
           Haskell expression.</para>
	  </listitem>

	  <listitem>
	    <para>Regular expression syntax changes:</para>

	    <itemizedlist>
	      <listitem>
		<para><literal>()</literal> is the empty regular
		expression (used to be
		&lsquo;<literal>$</literal>&rsquo;)</para>
	      </listitem>

	      <listitem>
		<para>set complement can now be expressed as
		<literal>[^sets]</literal> (for similarity with lex
		regular expressions).</para>
	      </listitem>

	      <listitem>
		<para>The <literal>'abc'</literal> form is no longer
		available, use <literal>[abc]</literal>
		instead.</para>
	      </listitem>

	      <listitem>
		<para>&lsquo;<literal>^</literal>&rsquo; and
                &lsquo;<literal>$</literal>&rsquo; have the usual
                meanings: &lsquo;<literal>^</literal>&rsquo; matches
                just after a &lsquo;<literal>\n</literal>&rsquo;, and
                &lsquo;<literal>$</literal>&rsquo; matches just before
                a &lsquo;<literal>\n</literal>&rsquo;.</para>
	      </listitem>

	      <listitem>
		<para>&lsquo;<literal>\n</literal>&rsquo; is now the
		escape character, not
		&lsquo;<literal>^</literal>&rsquo;.</para>
	      </listitem>

	      <listitem>
		<para>The form <literal>"..."</literal> means the same
                as the sequence of characters inside the quotes, the
                difference being that special characters do not need
                to be escaped inside <literal>"..."</literal>.</para>
	      </listitem>
	    </itemizedlist>
	  </listitem>

	  <listitem>
	    <para>Rules can have arbitrary predicates attached to
            them.  This subsumes the previous left-context and
            right-context facilities (although these are still allowed
            as syntactic sugar).</para>
	  </listitem>
	</itemizedlist>
      </section>

      <section id="changes-files">
	<title>Changes in the form of an Alex file</title>

	<itemizedlist>
	  <listitem>
	    <para>Each file can now only define a single grammar.
            This change was made to simplify code generation.
            Multiple grammars can be simulated using startcodes, or
            split into separate modules.</para>
	  </listitem>

	  <listitem>
	    <para>The programmer experience has been simplified, and
            at the same time made more flexible.  See the <xref
            linkend="api"/> for details.</para>
	  </listitem>

	  <listitem>
	    <para>You no longer need to import the
	    <literal>Alex</literal> module.</para>
	  </listitem>
	</itemizedlist>
      </section>

      <section id="changes-usage">
	<title>Usage changes</title>
	
	<para>The command-line syntax is quite different.  See <xref
	linkend="invoking"/>.</para>
      </section>

      <section id="changes-implementation">
	<title>Implementation changes</title>
	
	<itemizedlist>
	  <listitem>
	    <para>A more efficient table representation, coupled with
            standard table-compression techniques, are used to keep
            the size of the generated code down.</para>
	  </listitem>

	  <listitem>
	    <para>When compiling a grammar with GHC, the -g switch
            causes an even faster and smaller grammar to be
            generated.</para>
	  </listitem>

	  <listitem>
	    <para>Startcodes are implemented in a different way: each
            state corresponds to a different initial state in the DFA,
            so the scanner doesn't have to check the startcode when it
            gets to an accept state.  This results in a larger, but
            quicker, scanner.</para>
	  </listitem>
	</itemizedlist>
      </section>
    </section>

    <section id="bug-reports">
      <title>Reporting bugs in Alex</title>

      <para>Please report bugs in Alex to
      <email>simonmar@microsoft.com</email>.  There are no specific
      mailing lists for the discussion of Alex-related matters, but
      such topics should be fine on the <ulink
      url="http://www.haskell.org/mailman/listinfo/haskell">Haskell</ulink>
      and <ulink
      url="http://www.haskell.org/mailman/listinfo/haskell-cafe">Haskell
      Cafe</ulink> mailing lists.</para>
    </section>

    <section id="license">
      <title>License</title>

      <para>Copyright (c) 1995-2003, Chris Dornan and Simon Marlow.
      All rights reserved.</para>

      <para>Redistribution and use in source and binary forms, with or
      without modification, are permitted provided that the following
      conditions are met:</para>

      <itemizedlist>
	<listitem>
	  <para>Redistributions of source code must retain the above
          copyright notice, this list of conditions and the following
          disclaimer.</para>
	</listitem>

	<listitem>
	  <para>Redistributions in binary form must reproduce the
          above copyright notice, this list of conditions and the
          following disclaimer in the documentation and/or other
          materials provided with the distribution.</para>
	</listitem>

	<listitem>
	  <para>Neither the name of the copyright holders, nor the
          names of the contributors may be used to endorse or promote
          products derived from this software without specific prior
          written permission.</para>
	</listitem>
      </itemizedlist>

      <para>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
      CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
      INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
      MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
      DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
      CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
      SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
      LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
      USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
      AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
      LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
      IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
      THE POSSIBILITY OF SUCH DAMAGE.</para>
    </section>
  </chapter>

  <chapter id="introduction">
    <title>Introduction</title>

    <para>Alex is a tool for generating lexical analysers in Haskell,
    given a description of the tokens to be recognised in the form of
    regular expressions.  It is similar to the tools
    lex and flex for C/C++.</para>

    <para>Alex takes a description of tokens based on regular
    expressions and generates a Haskell module containing code for
    scanning text efficiently.  Alex is designed to be familiar to
    existing lex users, although it does depart from lex in a number
    of ways.</para>

    <figure id="fig-tokens" float="1"><title>A simple Alex specification.</title>
<programlisting>{
module Main (main) where
}

%wrapper "basic"

$digit = 0-9			-- digits
$alpha = [a-zA-Z]		-- alphabetic characters

tokens :-

  $white+				;
  "--".*				;
  let					{ \s -> Let }
  in					{ \s -> In }
  $digit+				{ \s -> Int (read s) }
  [\=\+\-\*\/\(\)]			{ \s -> Sym (head s) }
  $alpha [$alpha $digit \_ \']*		{ \s -> Var s }

{
-- Each action has type :: String -> Token

-- The token type:
data Token =
	Let 		|
	In  		|
	Sym Char	|
	Var String	|
	Int Int
	deriving (Eq,Show)

main = do
  s &lt;- getContents
  print (alexScanTokens s)
}</programlisting></figure>

    <para>A sample specification is given in <xref
    linkend="fig-tokens"/>.  The first few lines between the
    <literal>{</literal> and <literal>}</literal> provide a code scrap
    (some inlined Haskell code) to be placed directly in the output.
    The scrap at the top of the file is normally used to declare the
    module name for the generated Haskell module, in this case
    <literal>Main</literal>.</para>

    <para>The next line, <literal>%wrapper "basic"</literal>, controls
    what kind of support code Alex should produce along with the basic
    scanner.  The <literal>basic</literal> wrapper selects a scanner
    that tokenises a <literal>String</literal> and returns a list of
    tokens.  Wrappers are described fully in <xref
    linkend="api"/>.</para>

    <para>The next two lines define the <literal>$digit</literal> and
    <literal>$alpha</literal> macros for use in the token
    definitions.</para>

    <para>The &lsquo;<literal>tokens :-</literal>&rsquo; line ends the
    macro definitions and starts the definition of the scanner.</para>

    <para>The scanner is specified as a series of token definitions
    where each token specification takes the form of</para>

<programlisting><replaceable>regexp</replaceable>   { <replaceable>code</replaceable> }</programlisting>

    <para>The meaning of this rule is "if the input matches
    <replaceable>regexp</replaceable>, then return
    <replaceable>code</replaceable>".  The code part along with the
    braces can be replaced by simply
    &lsquo;<literal>;</literal>&rsquo;, meaning that this token should
    be ignored in the input stream.  As you can see, we've used this
    to ignore whitespace in our example.</para>

    <para>Our scanner is set up so that the actions are all functions
    with type <literal>String->Token</literal>.  When the token is
    matched, the portion of the input stream that it matched is passed
    to the appropriate action function as a
    <literal>String</literal>.</para>

    <para>At the bottom of the file we have another code fragment,
    surrounded by braces <literal>{ ... }</literal>.  In this
    fragment, we declare the type of the tokens, and give a
    <literal>main</literal> function that we can use for testing it;
    the <literal>main</literal> function just tokenises the input and
    prints the results to standard output.</para>

    <para>Alex has kindly provided the following function which we can
    use to invoke the scanner:</para>

<programlisting>alexScanTokens :: String -> [Token]</programlisting>

    <para>Alex arranges for the input stream to be tokenised, each of
    the action functions to be passed the appropriate
    <literal>String</literal>, and a list of <literal>Token</literal>s
    returned as the result.  If the input stream is lazy, the output
    stream will also be produced lazily<footnote><para>that is, unless you
    have any patterns that require a long lookahead.</para>
      </footnote>.</para>

    <para>We have demonstrated the simplest form of scanner here,
    which was selected by the <literal>%wrapper "basic"</literal> line
    near the top of the file.  In general, actions do not have to have
    type <literal>String->Token</literal>, and there's no requirement
    for the scanner to return a list of tokens.</para>

    <para>With this specification in the file
    <literal>Tokens.x</literal>, Alex can be used to generate
    <literal>Tokens.hs</literal>:</para>

<screen>$ alex Tokens.x</screen>

    <para>If the module needs to be placed in a different file,
    <literal>Main.hs</literal> for example, then the output filename
    can be specified using the <option>-o</option> option:</para>

<screen>$ alex Tokens.x -o Main.hs</screen>
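
    <para>For example, with GHC the generated module might then be
    compiled and run like this (here
    <literal>myprogram.txt</literal> stands for whatever input you
    want to tokenise; the exact commands depend on your compiler and
    platform):</para>

<screen>$ ghc --make Main.hs
$ ./Main &lt; myprogram.txt</screen>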

    <para>The resulting module is Haskell&nbsp;98 compatible.  It can also
    be readily used with a <ulink
    url="http://www.haskell.org/happy/">Happy</ulink> parser.</para>
  </chapter>

  <chapter id="syntax">
    <title>Alex Files</title>

    <para>In this section we describe the layout of an Alex lexical
    specification.  We begin with the lexical syntax; elements of the
    lexical syntax are referred to throughout the rest of this
    documentation, so you may need to refer back to the following
    section several times.</para>

    <section id="lexical">
      <title>Lexical syntax</title>
      
      <para>Alex's lexical syntax is given below.  It is written as a
      set of macro definitions using Alex's own syntax.  These macros
      are used in the BNF specification of the syntax later on.</para>

<programlisting>$digit      = [0-9]
$octdig     = [0-7]
$hexdig     = [0-9A-Fa-f]
$special    = [\.\;\,\$\|\*\+\?\#\~\-\{\}\(\)\[\]\^\/]
$graphic    = $printable # $white

@string     = \" ($graphic # \")* \"
@id         = [A-Za-z][A-Za-z'_]*
@smac       = '$' id
@rmac       = '@' id
@char       = ($graphic # $special) | @escape
@escape     = '\\' ($printable | 'x' $hexdig+ | 'o' $octdig+ | $digit+)
@code       = -- curly braces surrounding a Haskell code fragment</programlisting>
    </section>

    <section id="alex-files">
      <title>Syntax of Alex files</title>

      <para>In the following description of the Alex syntax, we use an
      extended form of BNF, where optional phrases are enclosed in
      square brackets (<literal>[ ... ]</literal>), and phrases which
      may be repeated zero or more times are enclosed in braces
      (<literal>{ ... }</literal>).  Literal text is enclosed in
      single quotes.</para>

      <para>An Alex lexical specification is normally placed in a file
      with a <literal>.x</literal> extension.  The overall layout of
      an Alex file is:</para>

<programlisting>alex := [ @code ] [ wrapper ] { macrodef } @id ':-' { rule } [ @code ]</programlisting>

      <para>The file begins and ends with optional code fragments.
      These code fragments are copied verbatim into the generated
      source file.</para>

      <para>At the top of the file, the code fragment is normally used
      to declare the module name and some imports, and that is all it
      should do: don't declare any functions or types in the top code
      fragment, because Alex may need to inject some imports of its
      own into the generated lexer code, and it does this by adding
      them directly after this code fragment in the output
      file.</para>

      <para>Next comes an optional wrapper specification:</para>

<programlisting>wrapper := '%wrapper' @string</programlisting>

      <para>Wrappers are described in <xref
      linkend="wrappers"/>.</para>

      <section id="macrodefs">
	<title>Macro definitions</title>

	<para>Next, the lexer specification can contain a series of
	macro definitions.  There are two kinds of macros,
	<firstterm>character set macros</firstterm>, which begin with
	a <literal>$</literal>, and <firstterm>regular expression
	macros</firstterm>, which begin with a <literal>@</literal>.
	A character set macro can be used wherever a character set is
	valid (see <xref linkend="charsets"/>), and a regular
	expression macro can be used wherever a regular expression is
	valid (see <xref linkend="regexps"/>).</para>

<programlisting>macrodef  :=  @smac '=' set
           |  @rmac '=' regexp</programlisting>
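
	<para>For example, two character set macros and a regular
	expression macro built from them might be defined like this
	(the names are illustrative):</para>

<programlisting>$digit = 0-9
$alpha = [a-zA-Z]
@ident = $alpha [$alpha $digit \_]*</programlisting>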
      </section>

      <section id="rules">
	<title>Rules</title>

	<para>The rules are heralded by the sequence
	&lsquo;<literal><replaceable>id</replaceable> :-</literal>&rsquo;
        in the file.  It doesn't matter what you use for the
        identifier; it is just there for documentation purposes.  In
	fact, it can be omitted, but the <literal>:-</literal> must be
	left in.</para>

	<para>The syntax of rules is as follows:</para>

<programlisting>rule       := [ startcodes ] token
            | startcodes '{' { token } '}'

token      := [ left_ctx ] regexp [ right_ctx ]  rhs

rhs        := @code | ';'</programlisting>

	<para>Each rule defines one token in the lexical
	specification.  When the input stream matches the regular
	expression in a rule, the Alex lexer will return the value of
	the expression on the right hand side, which we call the
	<firstterm>action</firstterm>.  The action can be any Haskell
	expression.  Alex only places one restriction on actions: all
	the actions must have the same type.  They can be values in a
	token type, for example, or possibly operations in a monad.
	More about how this all works is in <xref
	linkend="api"/>.</para>

	<para>The action may be missing, indicated by replacing it
	with &lsquo;<literal>;</literal>&rsquo;, in which case the
	token will be skipped in the input stream.</para>

	<para>Alex will always find the longest match.  For example,
	if we have a rule that matches whitespace:</para>

<programlisting>$white+        ;</programlisting>

        <para>Then this rule will match as much whitespace at the
        beginning of the input stream as it can.  Be careful: if we
        had instead written this rule as</para>

<programlisting>$white*        ;</programlisting>

	<para>then it would also match the empty string, which would
	mean that Alex could never fail to match a rule!</para>

	<para>When the input stream matches more than one rule, the
	rule which matches the longest prefix of the input stream
	wins.  If there are still several rules which match an equal
	number of characters, then the rule which appears earliest in
	the file wins.</para>
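
	<para>For example, given the two rules below, written in the
	style of the example in <xref linkend="introduction"/> (and
	assuming its <literal>$alpha</literal> macro), the input
	<literal>lettuce</literal> matches the second rule because it
	matches more characters, whereas the input
	<literal>let</literal> matches the first rule: both rules
	match three characters, but the first appears earlier in the
	file:</para>

<programlisting>let       { \s -> Let }
$alpha+   { \s -> Var s }</programlisting>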

	<section id="contexts">
	  <title>Contexts</title>
    
	  <para>Alex allows a left and right context to be placed on
	  any rule:</para>
	  
<programlisting>
left_ctx   := '^'
            | set '^'

right_ctx  := '$'
            | '/' regexp
            | '/' @code
</programlisting>

	  <para>The left context matches the character which
	  immediately precedes the token in the input stream.  The
	  character immediately preceding the beginning of the stream
	  is assumed to be &lsquo;<literal>\n</literal>&rsquo;.  The
	  special left-context &lsquo;<literal>^</literal>&rsquo; is
	  shorthand for &lsquo;<literal>\n^</literal>&rsquo;.</para>

	  <para>Right context is rather more general.  There are three
	  forms:</para>

	  <variablelist>
	    <varlistentry>
	      <term>
                <literal>/ <replaceable>regexp</replaceable></literal>
              </term>
	      <listitem>
		<para>This right-context causes the rule to match if
 	        and only if it is followed in the input stream by text
 	        which matches
 	        <replaceable>regexp</replaceable>.</para>

		<para>NOTE: this should be used sparingly, because it
		can have a serious impact on performance.  Any time
		this rule <emphasis>could</emphasis> match, its
		right-context will be checked against the current
		input stream.</para>
	      </listitem>
	    </varlistentry>

	    <varlistentry>
	      <term><literal>$</literal></term>
	      <listitem>
		<para>Equivalent to
		&lsquo;<literal>/\n</literal>&rsquo;.</para>
	      </listitem>
	    </varlistentry>

	    <varlistentry>
	      <term><literal>/ { ... }</literal></term>
	      <listitem>
		<para>This form is called a
		<emphasis>predicate</emphasis> on the rule.  The
		Haskell expression inside the curly braces should have
		type:
<programlisting>{ ... } :: user       -- predicate state
        -> AlexInput  -- input stream before the token
        -> Int        -- length of the token
        -> AlexInput  -- input stream after the token
        -> Bool       -- True &lt;=> accept the token</programlisting>
                Alex will only accept the token as matching if
                the predicate returns <literal>True</literal>.</para>

                <para>See <xref linkend="api"/> for the meaning of the
                <literal>AlexInput</literal> type.  The
                <literal>user</literal> argument is available for
                passing into the lexer a special state which is used
                by predicates; to give this argument a value, the
                <literal>alexScanUser</literal> entry point to the
                lexer must be used (see <xref
                linkend="basic-api"/>).</para>
	      </listitem>
	    </varlistentry>
	  </variablelist>
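
	  <para>For example (using basic-wrapper-style actions,
	  hypothetical token constructors, and a
	  <literal>$digit</literal> macro like the one in <xref
	  linkend="lexical"/>), the first rule below only matches at
	  the beginning of a line, and the second only matches when
	  the digits are followed by a dot:</para>

<programlisting>^ "#" .*         { \s -> Directive s }
$digit+ / "."    { \s -> Digits s }</programlisting>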
	</section>

	<section id="startcodes">
	  <title>Start codes</title>
	  
	  <para>Start codes are a way of adding state to a lexical
	  specification, such that only certain rules will match for a
	  given state.</para>

	  <para>A startcode is simply an identifier, or the special
	  start code &lsquo;<literal>0</literal>&rsquo;.  Each rule
	  may be given a list of startcodes under which it
	  applies:</para>

<programlisting>startcode  := @id | '0'
startcodes := '&lt;' startcode { ',' startcode } '>'</programlisting>
    
	  <para>When the lexer is invoked to scan the next token from
	  the input stream, the start code to use is also specified
	  (see <xref linkend="api"/>).  Only rules that mention this
	  start code are then enabled.  Rules which do not have a list
	  of startcodes are available all the time.</para>

	  <para>Each distinct start code mentioned in the lexical
	  specification causes a definition of the same name to be
	  inserted in the generated source file, whose value is of
	  type <literal>Int</literal>.  For example, if we mentioned
	  startcodes <literal>foo</literal> and <literal>bar</literal>
	  in the lexical spec, then Alex will create definitions such
	  as:
<programlisting>foo = 1
bar = 2</programlisting>
          in the output file.</para>

	  <para>Another way to think of start codes is as a way to
	  define several different (but possibly overlapping) lexical
	  specifications in a single file, since each start code
	  corresponds to a different set of rules.  In concrete terms,
	  each start code corresponds to a distinct initial state in
	  the state machine that Alex derives from the lexical
	  specification.</para>

	  <para>Here is an example of using startcodes as states, for
	  collecting the characters inside a string:</para>

<programlisting>&lt;0>      ([^\"] | \n)*  ;
&lt;0>      \"             { begin string }
&lt;string> [^\"]          { stringchar }
&lt;string> \"             { begin 0 }</programlisting>

          <para>When it sees a quotation mark, the lexer switches into
          the <literal>string</literal> state and each character
          thereafter causes a <literal>stringchar</literal> action,
          until the next quotation mark is found, when we switch back
          into the <literal>0</literal> state again.</para>
	  
	  <para>From the lexer's point of view, the startcode is just
	  an integer passed in, which tells it which state to start
	  in.  In order to actually use it as a state, you must have
	  some way for the token actions to specify new start codes -
	  <xref linkend="api"/> describes some ways this can be done.
	  In some applications, it might be necessary to keep a
	  <emphasis>stack</emphasis> of start codes, where at the end
	  of a state we pop the stack and resume parsing in the
	  previous state.  If you want this functionality, you have to
	  program it yourself.</para>
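
	  <para>As a sketch of how this might be done, the following
	  helpers keep a stack of start codes in the user state of
	  the <literal>monadUserState</literal> wrapper (see <xref
	  linkend="wrappers"/>).  The names
	  <literal>pushStartCode</literal>,
	  <literal>popStartCode</literal> and
	  <literal>lexerCodeStack</literal> are illustrative and not
	  part of Alex itself:</para>

<programlisting>data AlexUserState = AlexUserState { lexerCodeStack :: [Int] }

alexInitUserState :: AlexUserState
alexInitUserState = AlexUserState { lexerCodeStack = [] }

-- Save the current start code and switch to a new one.
pushStartCode :: Int -> Alex ()
pushStartCode code = do
  current &lt;- alexGetStartCode
  Alex $ \s -> Right
    (s{alex_ust = (alex_ust s){lexerCodeStack = current : lexerCodeStack (alex_ust s)}}, ())
  alexSetStartCode code

-- Return to the start code saved by the most recent pushStartCode.
popStartCode :: Alex ()
popStartCode = do
  stack &lt;- Alex $ \s -> Right (s, lexerCodeStack (alex_ust s))
  case stack of
    []     -> alexSetStartCode 0
    (c:cs) -> do
      Alex $ \s -> Right (s{alex_ust = (alex_ust s){lexerCodeStack = cs}}, ())
      alexSetStartCode c</programlisting>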
	</section>

      </section> <!-- rules -->
    </section> <!-- syntax of alex files -->
  </chapter> <!-- alex files -->

  <chapter id="regexps">
    <title>Regular Expressions</title>

    <para>Regular expressions are the patterns that Alex uses to match
    tokens in the input stream.</para>

    <section id="regexp-syntax">
      <title>Syntax of regular expressions</title>

<programlisting>regexp  := rexp2 { '|' rexp2 }

rexp2   := rexp1 { rexp1 }

rexp1   := rexp0 [ '*' | '+' | '?' | repeat ]

rexp0   := set
         | @rmac
         | @string
         | '(' [ regexp ] ')'

repeat  := '{' $digit '}'
         | '{' $digit ',' '}'
         | '{' $digit ',' $digit '}'</programlisting>

      <para>The syntax of regular expressions is fairly standard; the
      only difference from normal lex-style regular expressions is
      that we allow the sequence <literal>()</literal> to denote the
      regular expression that matches the empty string.</para>

      <para>Spaces are ignored in a regular expression, so feel free
      to space out your regular expression as much as you like, even
      split it over multiple lines and include comments.  Literal
      whitespace can be included by surrounding it with quotes
      <literal>"&nbsp;&nbsp;&nbsp;"</literal>, or by escaping each whitespace character
      with <literal>\</literal>.</para>
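
      <para>For example, a macro for a simple floating point literal
      (assuming the <literal>$digit</literal> macro from <xref
      linkend="lexical"/>) could be spread over several lines with
      comments:</para>

<programlisting>@float = $digit+                    -- integer part
         "." $digit+                -- fraction
         ([eE] [\+\-]? $digit+)?    -- optional exponent</programlisting>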

      <variablelist>
	<varlistentry>
	  <term><literal><replaceable>set</replaceable></literal></term>
	  <listitem>
	    <para>Matches any of the characters in
	    <replaceable>set</replaceable>.  See <xref
	    linkend="charsets"/> for the syntax of sets.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>@foo</literal></term>
	  <listitem>
	    <para>Expands to the definition of the appropriate
	    regular expression macro.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>"..."</literal></term>
	  <listitem>
	    <para>Matches the sequence of characters in the string, in
	    that order.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal><replaceable>r</replaceable>*</literal></term>
	  <listitem>
	    <para>Matches zero or more occurrences of
	    <replaceable>r</replaceable>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal><replaceable>r</replaceable>+</literal></term>
	  <listitem>
	    <para>Matches one or more occurrences of
	    <replaceable>r</replaceable>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal><replaceable>r</replaceable>?</literal></term>
	  <listitem>
	    <para>Matches zero or one occurrence of
	    <replaceable>r</replaceable>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal><replaceable>r</replaceable>{<replaceable>n</replaceable>}</literal></term>
	  <listitem>
	    <para>Matches <replaceable>n</replaceable> occurrences of
	    <replaceable>r</replaceable>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal><replaceable>r</replaceable>{<replaceable>n</replaceable>,}</literal></term>
	  <listitem>
	    <para>Matches <replaceable>n</replaceable> or more occurrences of
	    <replaceable>r</replaceable>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal><replaceable>r</replaceable>{<replaceable>n</replaceable>,<replaceable>m</replaceable>}</literal></term>
	  <listitem>
	    <para>Matches between <replaceable>n</replaceable> and
	    <replaceable>m</replaceable> (inclusive) occurrences of
	    <replaceable>r</replaceable>.</para>
	  </listitem>
	</varlistentry>
      </variablelist>
    </section>

    <section id="charsets">
      <title>Syntax of character sets</title>

      <para>Character sets are the fundamental elements in a regular
      expression.  A character set is a pattern that matches a single
      character.  The syntax of character sets is as follows:</para>

<programlisting>set     := set '#' set0
        |  set0

set0    := @char [ '-' @char ]
        | '.'
        |  @smac
        | '[' [ '^' ] { set } ']'
        | '~' set0</programlisting>

      <para>The various character set constructions are:</para>
      
      <variablelist>
	<varlistentry>
	  <term><literal><replaceable>char</replaceable></literal></term>
	  <listitem>
	    <para>The simplest character set is a single character.
            Note that special characters such as <literal>[</literal>
            and <literal>.</literal> must be escaped by prefixing them
            with <literal>\</literal> (see the lexical syntax, <xref
            linkend="lexical"/>, for the list of special
            characters).</para>

	    <para>Certain non-printable characters have special escape
            sequences.  These are: <literal>\a</literal>,
            <literal>\b</literal>, <literal>\f</literal>,
            <literal>\n</literal>, <literal>\r</literal>,
            <literal>\t</literal>, and <literal>\v</literal>.  Other
            characters can be represented by using their numerical
            character values (although this may be non-portable):
            <literal>\x0A</literal> is equivalent to
            <literal>\n</literal>, for example.</para>

	    <para>Whitespace characters are ignored; to represent a
	    literal space, escape it with <literal>\</literal>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal><replaceable>char</replaceable>-<replaceable>char</replaceable></literal></term>
	  <listitem>
	    <para>A range of characters can be expressed by separating
            the characters with a &lsquo;<literal>-</literal>&rsquo;;
            all the characters with codes in the given range are
            included in the set.  Character ranges can also be
            non-portable.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>.</literal></term>
	  <listitem>
	    <para>The built-in set &lsquo;<literal>.</literal>&rsquo;
            matches all characters except newline
            (<literal>\n</literal>).</para>

	    <para>Equivalent to the set
	    <literal>[\x00-\xff]&nbsp;#&nbsp;\n</literal>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal><replaceable>set0</replaceable> # <replaceable>set1</replaceable></literal></term>
	  <listitem>
	    <para>Matches all the characters in
	    <replaceable>set0</replaceable> that are not in
	    <replaceable>set1</replaceable>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>[<replaceable>sets</replaceable>]</literal></term>
	  <listitem>
	    <para>The union of <replaceable>sets</replaceable>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>[^<replaceable>sets</replaceable>]</literal></term>
	  <listitem>
	    <para>The complement of the union of the
	    <replaceable>sets</replaceable>.  Equivalent to
	    &lsquo;<literal>. # [<replaceable>sets</replaceable>]</literal>&rsquo;.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>~<replaceable>set</replaceable></literal></term>
	  <listitem>
	    <para>The complement of <replaceable>set</replaceable>.
	    Equivalent to &lsquo;<literal>. # <replaceable>set</replaceable></literal>&rsquo;</para>
	  </listitem>
	</varlistentry>
      </variablelist>

      <para>A set macro is written as <literal>$</literal> followed by
      an identifier.  There are some builtin character set
      macros:</para>

      <variablelist>
	<varlistentry>
	  <term><literal>$white</literal></term>
	  <listitem>
	    <para>Matches all whitespace characters, including
	    newline.</para>

	    <para>Equivalent to the set
	    <literal>[\ \t\n\f\v\r]</literal>.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>$printable</literal></term>
	  <listitem>
	    <para>Matches all printable characters (characters 32 to
	    126 in ASCII).  Equivalent to the set
	    <literal>[\32-\126]</literal>.</para>
	  </listitem>
	</varlistentry>
      </variablelist>
      
      <para>Character set macros can be defined at the top of the file
      at the same time as regular expression macros (see <xref
      linkend="regexps"/>).  Here are some example character set
      macros:</para>

<programlisting>$lls      = a-z                   -- little letters
$not_lls  = ~a-z                  -- anything but little letters
$ls_ds    = [a-zA-Z0-9]           -- letters and digits
$sym      = [ \! \@ \# \$ ]       -- the symbols !, @, #, and $
$sym_q_nl = [ \' \! \@ \# \$ \n ] -- the above symbols with ' and newline
$quotable = $printable # \'       -- any graphic character except '
$del      = \127                  -- ASCII DEL</programlisting>
    </section>

  </chapter>

  <chapter id="api">
    <title>The Interface to an Alex-generated lexer</title>

    <para>This section answers the question: "How do I include an
    Alex lexer in my program?"</para>

    <para>Alex provides for a great deal of flexibility in how the
    lexer is exposed to the rest of the program.  For instance,
    there's no need to parse a <literal>String</literal> directly if
    you have some special character-buffer operations that avoid the
    overheads of ordinary Haskell <literal>String</literal>s.  You
    might want Alex to keep track of the line and column number in the
    input text, or you might wish to do it yourself (perhaps you use a
    different tab width from the standard 8-columns, for
    example).</para>

    <para>The general story is this: Alex provides a basic interface
    to the generated lexer (described in the next section), which you
    can use to parse tokens given an abstract input type with
    operations over it.  You also have the option of including a
    <firstterm>wrapper</firstterm>, which provides a higher-level
    abstraction over the basic interface; Alex comes with several
    wrappers.</para>

    <section id="basic-api">
      <title>Basic interface</title>

      <para>If you compile your Alex file without a
      <literal>%wrapper</literal> declaration, then you get access to
      the lowest-level API to the lexer.  You must provide definitions
      for the following, either in the same module or imported from
      another module:</para>

<programlisting>type AlexInput
alexGetChar       :: AlexInput -> Maybe (Char,AlexInput)
alexInputPrevChar :: AlexInput -> Char</programlisting>

      <para>The generated lexer is independent of the input type,
      which is why you have to provide a definition for the input type
      yourself.  Note that the input type needs to keep track of the
      <emphasis>previous</emphasis> character in the input stream;
      this is used for implementing patterns with a left-context
      (those that begin with <literal>^</literal> or
      <literal><replaceable>set</replaceable>^</literal>).  If you
      don't ever use patterns with a left-context in your lexical
      specification, then you can safely forget about the previous
      character in the input stream, and have
      <literal>alexInputPrevChar</literal> return
      <literal>undefined</literal>.</para>
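
      <para>For example, a minimal input type over an ordinary
      <literal>String</literal> might look like the following; this
      is essentially what the <literal>basic</literal> wrapper (<xref
      linkend="wrappers"/>) provides:</para>

<programlisting>type AlexInput = (Char,    -- previous character
                  String)  -- remaining input

alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (_, [])   = Nothing
alexGetChar (_, c:cs) = Just (c, (c,cs))

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (c,_) = c</programlisting>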

      <para>Alex will provide the following function:</para>

<programlisting>alexScan :: AlexInput             -- The current input
         -> Int                   -- The "start code"
         -> AlexReturn action     -- The return value

data AlexReturn action
  = AlexEOF

  | AlexError
      !AlexInput     -- Remaining input

  | AlexSkip
      !AlexInput     -- Remaining input
      !Int           -- Token length

  | AlexToken  
      !AlexInput     -- Remaining input
      !Int           -- Token length
      action         -- action value</programlisting>

      <para>Calling <literal>alexScan</literal> will scan a single
      token from the input stream, and return a value of type
      <literal>AlexReturn</literal>.  The value returned is either:</para>

      <variablelist>
	<varlistentry>
	  <term><literal>AlexEOF</literal></term>
	  <listitem>
	    <para>The end-of-file was reached.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>AlexError</literal></term>
	  <listitem>
	    <para>A valid token could not be recognised.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>AlexSkip</literal></term>
	  <listitem>
	    <para>The matched token did not have an action associated
	    with it.</para>
	  </listitem>
	</varlistentry>

	<varlistentry>
	  <term><literal>AlexToken</literal></term>
	  <listitem>
	    <para>A token was matched, and the action associated with
	    it is returned.</para>
	  </listitem>
	</varlistentry>
      </variablelist>

      <para>The <literal>action</literal> is simply the value of the
      expression inside <literal>{...}</literal> on the
      right-hand-side of the appropriate rule in the Alex file.
      Alex doesn't specify what type these expressions should have; it
      simply requires that they all have the same type, or else you'll
      get a type error when you try to compile the generated
      lexer.</para>

      <para>Once you have the <literal>action</literal>, it is up to
      you what to do with it.  The type of <literal>action</literal>
      could be a function which takes the <literal>String</literal>
      representation of the token and returns a value in some token
      type, or it could be a continuation that takes the new input and
      calls <literal>alexScan</literal> again, building a list of
      tokens as it goes.</para>

      <para>This is pretty low-level stuff; you have complete
      flexibility about how you use the lexer, but there might be a
      fair amount of support code to write before you can actually use
      it.  For this reason, we also provide a selection of wrappers
      that add some common functionality to this basic scheme.
      Wrappers are described in the next section.</para>

      <para>There is another entry point, which is useful if your
      grammar contains any predicates (see <xref
      linkend="contexts"/>):</para>

<programlisting>alexScanUser
         :: user             -- predicate state
         -> AlexInput        -- The current input
         -> Int              -- The "start code"
         -> Maybe (          -- Nothing on error or EOF
                 AlexInput,  -- The remaining input
                 Int,        -- Length of this token
                 action      -- The action (an unknown type)
              )</programlisting>

      <para>The extra argument, of some type <literal>user</literal>,
      is passed to each predicate.</para>
    </section>

    <section id="wrappers">
      <title>Wrappers</title>

      <para>To use one of the provided wrappers, include the following
      declaration in your file:</para>

<programlisting>%wrapper "<replaceable>name</replaceable>"</programlisting>

      <para>where <replaceable>name</replaceable> is the name of the
      wrapper, e.g. <literal>basic</literal>.  The following sections
      describe each of the wrappers that come with Alex.</para>

      <section>
	<title>The "basic" wrapper</title>

	<para>The basic wrapper is a good way to obtain a function of
	type <literal>String -> [token]</literal> from a lexer
	specification, with little fuss.</para>

	<para>It provides definitions for
	<literal>AlexInput</literal>, <literal>alexGetChar</literal>
	and <literal>alexInputPrevChar</literal> that are suitable for
	lexing a <literal>String</literal> input.  It also provides a
	function <literal>alexScanTokens</literal> which takes a
	<literal>String</literal> input and returns a list of the
	tokens it contains.</para>

	<para>The <literal>basic</literal> wrapper provides no support
	for using startcodes; the initial startcode is always set to
	zero.</para>

	<para>Here is the actual code included in the lexer when the
	basic wrapper is selected:</para>

<programlisting>type AlexInput = (Char,     -- previous char
                  String)   -- current input string

alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (_, [])   = Nothing
alexGetChar (_, c:cs) = Just (c, (c,cs))

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (c,_) = c

-- alexScanTokens :: String -> [token]
alexScanTokens str = go ('\n',str)
  where go inp@(_,str) =
          case alexScan inp 0 of
                AlexEOF -> []
                AlexError _ -> error "lexical error"
                AlexSkip  inp' len     -> go inp'
                AlexToken inp' len act -> act (take len str) : go inp'</programlisting>

	<para>The type signature for <literal>alexScanTokens</literal>
        is commented out, because the <literal>token</literal> type is
        unknown.  All of the actions in your lexical specification
        should have type:</para>

<programlisting>{ ... } :: String -> token</programlisting>

        <para>for some type <literal>token</literal>.</para>

	<para>For an example of the use of the basic wrapper, see the
	file <literal>examples/Tokens_basic.x</literal> in the Alex
	distribution.</para>
      </section>

      <section>
	<title>The "posn" wrapper</title>

	<para>The posn wrapper provides slightly more functionality
	than the basic wrapper: it keeps track of line and column
	numbers of tokens in the input text.</para>

	<para>The posn wrapper provides the following, in addition to
	the straightforward definitions of
	<literal>alexGetChar</literal> and
	<literal>alexInputPrevChar</literal>:</para>

<programlisting>data AlexPosn = AlexPn !Int  -- absolute character offset
                       !Int  -- line number
                       !Int  -- column number

type AlexInput = (AlexPosn, -- current position,
                  Char,     -- previous char
                  String)   -- current input string

--alexScanTokens :: String -> [token]
alexScanTokens str = go (alexStartPos,'\n',str)
  where go inp@(pos,_,str) =
          case alexScan inp 0 of
                AlexEOF -> []
                AlexError _ -> error "lexical error"
                AlexSkip  inp' len     -> go inp'
                AlexToken inp' len act -> act pos (take len str) : go inp'</programlisting>

	<para>The types of the token actions should be:</para>

<programlisting>{ ... } :: AlexPosn -> String -> token</programlisting>
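
	<para>For instance, a rule that builds a token recording
	where an identifier occurred might look like this (the
	<literal>Id</literal> constructor and
	<literal>$alpha</literal> macro are illustrative):</para>

<programlisting>$alpha+    { \p s -> Id p s }</programlisting>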

	<para>For an example using the <literal>posn</literal>
	wrapper, see the file
	<literal>examples/Tokens_posn.x</literal> in the Alex
	distribution.</para>
      </section>

      <section>
	<title>The "monad" wrapper</title>

	<para>The <literal>monad</literal> wrapper is the most
	flexible of the wrappers provided with Alex.  It includes a
	state monad which keeps track of the current input and text
	position, and the startcode.  It is intended to be a template
	for building your own monads - feel free to copy the code and
	modify it to build a monad with the facilities you
	need.</para>

<programlisting>data AlexState = AlexState {
        alex_pos :: !AlexPosn,  -- position at current input location
        alex_inp :: String,     -- the current input
        alex_chr :: !Char,      -- the character before the input
        alex_scd :: !Int        -- the current startcode
    }

newtype Alex a = Alex { unAlex :: AlexState
                               -> Either String (AlexState, a) }

runAlex          :: String -> Alex a -> Either String a

alexGetInput     :: Alex AlexInput
alexSetInput     :: AlexInput -> Alex ()

alexError        :: String -> Alex a

alexGetStartCode :: Alex Int
alexSetStartCode :: Int -> Alex ()</programlisting>

	<para>To invoke a scanner under the <literal>monad</literal>
	wrapper, use <literal>alexMonadScan</literal>:</para>

<programlisting>alexMonadScan :: Alex result</programlisting>

	<para>The token actions should have the following type:</para>

<programlisting>type AlexAction result = AlexInput -> Int -> Alex result
{ ... }  :: AlexAction result</programlisting>

	<para>The <literal>monad</literal> wrapper also provides some
	useful combinators for constructing token actions:</para>

<programlisting>-- skip :: AlexAction result
skip input len = alexMonadScan

-- andBegin :: AlexAction result -> Int -> AlexAction result
(act `andBegin` code) input len = do alexSetStartCode code; act input len

-- begin :: Int -> AlexAction result
begin code = skip `andBegin` code

-- token :: (String -> Int -> token) -> AlexAction token
token t input len = return (t input len)</programlisting>
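
	<para>For example, the rules below use
	<literal>skip</literal> and <literal>begin</literal> to ignore
	whitespace and comments and to switch in and out of a
	<literal>string</literal> start code (this is only a sketch,
	not a complete string lexer):</para>

<programlisting>&lt;0>       $white+    { skip }
&lt;0>       "--" .*    { skip }
&lt;0>       \"         { begin string }
&lt;string>  \"         { begin 0 }</programlisting>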
      </section>

      <section>
	<title>The "monadUserState" wrapper</title>

	<para>The <literal>monadUserState</literal> wrapper is built 
    upon the <literal>monad</literal> wrapper. It includes a reference
    to a type which must be defined in the user's program, 
    <literal>AlexUserState</literal>, and a call to an initialization
    function which must also be defined in the user's program,
    <literal>alexInitUserState</literal>. This gives great flexibility,
    because any information the lexer needs can be added to this state
    and carried through the whole lexing phase.</para>

    <para>The generated code is the same as in the <literal>monad</literal> 
    wrapper, except in two places:</para>
    <para>1) The definition of the general state, which now refers to a
    type (<literal>AlexUserState</literal>) that must be defined in the Alex file.</para>

<programlisting>data AlexState = AlexState {
        alex_pos :: !AlexPosn,  -- position at current input location
        alex_inp :: String,     -- the current input
        alex_chr :: !Char,      -- the character before the input
        alex_scd :: !Int        -- the current startcode
      , alex_ust :: AlexUserState -- AlexUserState will be defined in the user program
    }
</programlisting>

    <para>2) The initialization code, where a user-specified routine (<literal>alexInitUserState</literal>) will be 
    called.</para>

<programlisting>runAlex :: String -> Alex a -> Either String a
runAlex input (Alex f)
   = case f (AlexState {alex_pos = alexStartPos,
                        alex_inp = input,
                        alex_chr = '\n',
                        alex_ust = alexInitUserState,
                        alex_scd = 0}) of Left msg -> Left msg
                                          Right ( _, a ) -> Right a
</programlisting>

    <para>Here is an example of code in the user's Alex file defining
    the type and function:</para>

<programlisting>data AlexUserState = AlexUserState
                   {
                       lexerCommentDepth  :: Int
                     , lexerStringValue   :: String
                   }

alexInitUserState :: AlexUserState
alexInitUserState = AlexUserState
                   {
                       lexerCommentDepth  = 0
                     , lexerStringValue   = ""
                   }

getLexerCommentDepth :: Alex Int
getLexerCommentDepth = Alex $ \s@AlexState{alex_ust=ust} -> Right (s, lexerCommentDepth ust)

setLexerCommentDepth :: Int -> Alex ()
setLexerCommentDepth ss = Alex $ \s -> Right (s{alex_ust=(alex_ust s){lexerCommentDepth=ss}}, ())

getLexerStringValue :: Alex String
getLexerStringValue = Alex $ \s@AlexState{alex_ust=ust} -> Right (s, lexerStringValue ust)

setLexerStringValue :: String -> Alex ()
setLexerStringValue ss = Alex $ \s -> Right (s{alex_ust=(alex_ust s){lexerStringValue=ss}}, ())

addCharToLexerStringValue :: Char -> Alex ()
addCharToLexerStringValue c = Alex $ \s -> Right (s{alex_ust=(alex_ust s){lexerStringValue=c:lexerStringValue (alex_ust s)}}, ())
</programlisting>
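    <para>These definitions can then be used from token actions.  The
    following sketch maintains the depth of nested comments; the
    <literal>state_comment</literal> start code is assumed to be declared
    in the rules, and <literal>skip</literal>,
    <literal>alexSetStartCode</literal> and <literal>AlexAction</literal>
    come from the <literal>monad</literal> wrapper:</para>

<programlisting>-- enterNewComment, embedComment, unembedComment :: AlexAction result
enterNewComment input len =
    do setLexerCommentDepth 1
       alexSetStartCode state_comment
       skip input len

embedComment input len =
    do cd &lt;- getLexerCommentDepth
       setLexerCommentDepth (cd + 1)
       skip input len

unembedComment input len =
    do cd &lt;- getLexerCommentDepth
       setLexerCommentDepth (cd - 1)
       if cd == 1 then alexSetStartCode 0 else return ()
       skip input len</programlisting>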
      </section>

      <section>
	<title>The "gscan" wrapper</title>

	<para>The <literal>gscan</literal> wrapper is provided mainly
	for historical reasons: it exposes an interface which is very
	similar to that provided by Alex version 1.x.  The interface
	is intended to be very general, allowing actions to modify the
	startcode, and pass around an arbitrary state value.</para>

<programlisting>alexGScan :: StopAction state result -> state -> String -> result

type StopAction state result 
         = AlexPosn -> Char -> String -> (Int,state) -> result</programlisting>    

	<para>The token actions should all have this type:</para>

<programlisting>{ ... }      :: AlexPosn                -- token position
             -> Char                    -- previous character
             -> String                  -- input string at token
             -> Int                     -- length of token
             -> ((Int,state) -> result) -- continuation
             -> (Int,state)             -- current (startcode,state)
             -> result</programlisting>    
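	<para>For example, a scanner that simply collects the text of each
	token could thread a list of strings through the state.  This is
	only a sketch of how the pieces fit together:</para>

<programlisting>-- collect the text of every token; the final result is produced by the
-- stop action at the end of the input
mkToken :: AlexPosn -> Char -> String -> Int
        -> ((Int,[String]) -> [String]) -> (Int,[String]) -> [String]
mkToken _pos _prev inp len cont (sc,toks) = cont (sc, take len inp : toks)

stopToken :: StopAction [String] [String]
stopToken _pos _prev _inp (_sc,toks) = reverse toks

scanAll :: String -> [String]
scanAll inp = alexGScan stopToken [] inp</programlisting>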
      </section>
      <section>
        <title>The bytestring wrappers</title>

	<para>The <literal>basic-bytestring</literal>,
	<literal>posn-bytestring</literal> and
	<literal>monad-bytestring</literal> wrappers are variations on the
	<literal>basic</literal>, <literal>posn</literal> and
	<literal>monad</literal> wrappers that use lazy
	<literal>ByteString</literal>s as the input and token types instead of
	an ordinary <literal>String</literal>.</para>
	
	<para>The point of using these wrappers is that
	<literal>ByteString</literal>s provide a more memory-efficient
	representation of an input stream.  They can also be somewhat faster
	to process.  Note, however, that they treat the input purely as a
	sequence of 8-bit characters.  Note also that using these wrappers
	adds a dependency on the <literal>ByteString</literal> modules, which
	live in the <literal>bytestring</literal> package (or in the
	<literal>base</literal> package in <literal>ghc-6.6</literal>).</para>
	
	<para>Do note that <literal>token</literal> provides a
	<emphasis>lazy</emphasis> <literal>ByteString</literal>, which is not
	the most compact representation for short strings.  You may want to
	convert to a strict <literal>ByteString</literal>, or perhaps
	something more compact still.  Note also that, by default, tokens
	share space with the input <literal>ByteString</literal>: this has
	the advantage that no copy needs to be made, but it also prevents the
	input from being garbage collected.  In some applications it may
	therefore make sense to use <literal>ByteString</literal>'s
	<literal>copy</literal> function to unshare tokens that will be kept
	for a long time, so that the original input can be collected.</para>
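	<para>For example, with the <literal>basic-bytestring</literal>
	wrapper (described below) an action might copy the matched text so
	that the token no longer shares space with the input (a sketch; the
	<literal>Token</literal> type is hypothetical):</para>

<programlisting>import qualified Data.ByteString.Lazy.Char8 as ByteString

data Token = Ident ByteString.ByteString

-- copy the matched text so that keeping the token does not retain the
-- whole input ByteString
mkIdent :: ByteString.ByteString -> Token
mkIdent bs = Ident (ByteString.copy bs)</programlisting>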

	<section>
	<title>The "basic-bytestring" wrapper</title>
	<para>The <literal>basic-bytestring</literal> wrapper is the same as
	the <literal>basic</literal> wrapper but with lazy
	<literal>ByteString</literal> instead of <literal>String</literal>:</para>

<programlisting>
import qualified Data.ByteString.Lazy.Char8 as ByteString

type AlexInput = (Char,       -- previous char
                  ByteString) -- current input string

alexGetChar :: AlexInput -> Maybe (Char,AlexInput)

alexInputPrevChar :: AlexInput -> Char

-- alexScanTokens :: ByteString -> [token]
</programlisting>

	<para>All of the actions in your lexical specification
        should have type:</para>

<programlisting>{ ... } :: ByteString -> token</programlisting>

        <para>for some type <literal>token</literal>.</para>
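        <para>For example, an action for an integer literal might look like
        this (a sketch; the <literal>Token</literal> type is
        hypothetical):</para>

<programlisting>import qualified Data.ByteString.Lazy.Char8 as ByteString

data Token = IntLit Int

-- the action receives the matched text as a lazy ByteString
mkIntLit :: ByteString.ByteString -> Token
mkIntLit bs = IntLit (read (ByteString.unpack bs))</programlisting>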
	</section>

	<section>
	<title>The "posn-bytestring" wrapper</title>
	<para>The <literal>posn-bytestring</literal> wrapper is the same as
	the <literal>posn</literal> wrapper but with lazy
	<literal>ByteString</literal> instead of <literal>String</literal>:</para>

<programlisting>
import qualified Data.ByteString.Lazy.Char8 as ByteString

type AlexInput = (AlexPosn,   -- current position,
                  Char,       -- previous char
                  ByteString) -- current input string

-- alexScanTokens :: ByteString -> [token]
</programlisting>

	<para>All of the actions in your lexical specification
        should have type:</para>

<programlisting>{ ... } :: AlexPosn -> ByteString -> token</programlisting>

        <para>for some type <literal>token</literal>.</para>
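        <para>For example, an action might record the line number of each
        token (a sketch; the <literal>Token</literal> type is hypothetical,
        and <literal>AlexPn</literal> is the constructor of
        <literal>AlexPosn</literal>, carrying the absolute offset, line and
        column):</para>

<programlisting>import qualified Data.ByteString.Lazy.Char8 as ByteString

data Token = Ident Int ByteString.ByteString   -- line number and text

mkIdent :: AlexPosn -> ByteString.ByteString -> Token
mkIdent (AlexPn _ line _) bs = Ident line bs</programlisting>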
	</section>

	<section>
	<title>The "monad-bytestring" wrapper</title>
	<para>The <literal>monad-bytestring</literal> wrapper is the same as
	the <literal>monad</literal> wrapper but with lazy
	<literal>ByteString</literal> instead of <literal>String</literal>:</para>

<programlisting>
import qualified Data.ByteString.Lazy.Char8 as ByteString

data AlexState = AlexState {
        alex_pos :: !AlexPosn,  -- position at current input location
        alex_inp :: ByteString, -- the current input
        alex_chr :: !Char,      -- the character before the input
        alex_scd :: !Int        -- the current startcode
    }

newtype Alex a = Alex { unAlex :: AlexState
                               -> Either String (AlexState, a) }

runAlex          :: ByteString -> Alex a -> Either String a

-- token :: (ByteString -> Int -> token) -> AlexAction token
</programlisting>

	<para>All of the actions in your lexical specification
        have the same type as in the <literal>monad</literal> wrapper.  Only
	the type of the function that runs the monad and the type of the
	<literal>token</literal> function change.</para>
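	<para>For example, the monad can be run directly over the contents
	of a file (a sketch; <literal>scanner</literal> stands for a
	user-defined <literal>Alex</literal> action, such as a loop around
	<literal>alexMonadScan</literal>, and the token type is assumed to
	have a <literal>Show</literal> instance):</para>

<programlisting>import qualified Data.ByteString.Lazy.Char8 as ByteString

main :: IO ()
main = do
  s &lt;- ByteString.readFile "input.txt"
  case runAlex s scanner of
    Left err   -> putStrLn ("lexical error: " ++ err)
    Right toks -> print toks</programlisting>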
	</section>

	<section>
	<title>The "monadUserState-bytestring" wrapper</title>
	<para>The <literal>monadUserState-bytestring</literal> wrapper is the same as
	the <literal>monadUserState</literal> wrapper but with lazy
	<literal>ByteString</literal> instead of <literal>String</literal>:</para>

<programlisting>
import qualified Data.ByteString.Lazy.Char8 as ByteString

data AlexState = AlexState {
        alex_pos :: !AlexPosn,  -- position at current input location
        alex_inp :: ByteString, -- the current input
        alex_chr :: !Char,      -- the character before the input
        alex_scd :: !Int        -- the current startcode
      , alex_ust :: AlexUserState -- AlexUserState will be defined in the user program
    }

newtype Alex a = Alex { unAlex :: AlexState
                               -> Either String (AlexState, a) }

runAlex          :: ByteString -> Alex a -> Either String a

-- token :: (ByteString -> Int -> token) -> AlexAction token
</programlisting>

	<para>All of the actions in your lexical specification
        have the same type as in the <literal>monadUserState</literal>
	wrapper.  Only the type of the function that runs the monad and the
	type of the <literal>token</literal> function change.</para>
	</section>
      </section>
    </section>
  </chapter>

  <chapter id="invoking">
    <title>Invoking Alex</title>

    <para>The command line syntax for Alex is entirely
    standard:</para>

<screen>$ alex { <replaceable>option</replaceable> } <replaceable>file</replaceable>.x  { <replaceable>option</replaceable> }</screen>

    <para>Alex expects a single
    <literal><replaceable>file</replaceable>.x</literal> to be named
    on the command line.  By default, Alex will create
    <literal><replaceable>file</replaceable>.hs</literal> containing
    the Haskell source for the lexer.</para>
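    <para>For example, the following generates
    <literal>Tokens.hs</literal> from a (hypothetical)
    <literal>Tokens.x</literal>; the second command uses the
    <option>-g</option> and <option>-o</option> options described below to
    optimise for GHC and to place the output elsewhere:</para>

<screen>$ alex Tokens.x
$ alex -g Tokens.x -o src/Tokens.hs</screen>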

    <para>The options that Alex accepts are listed below:</para>

    <variablelist>
      <varlistentry>
	<term><option>-o</option> <replaceable>file</replaceable></term>
	<term><option>--outfile</option>=<replaceable>file</replaceable></term>
	<listitem>
	  <para>Specifies the filename in which the output is to be
	  placed.  By default, this is the name of the input file with
	  the <literal>.x</literal> suffix replaced by
	  <literal>.hs</literal>.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-i</option> <optional><replaceable>file</replaceable></optional></term>
	<term><option>--info</option><optional>=<replaceable>file</replaceable></optional></term>
	<listitem>
	  <para>Produces a human-readable rendition of the state
	  machine (DFA) that Alex derives from the lexer, in
	  <replaceable>file</replaceable> (default:
	  <literal><replaceable>file</replaceable>.info</literal>
	  where the input file is
	  <literal><replaceable>file</replaceable>.x</literal>).</para>

	  <para>The format of the info file is currently a bit basic,
	  and not particularly informative.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-t</option> <optional><replaceable>dir</replaceable></optional></term>
	<term><option>--template</option>=<replaceable>dir</replaceable></term>
	<listitem>
	  <para>Look in <replaceable>dir</replaceable> for template files.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-g</option></term>
	<term><option>--ghc</option></term>
	<listitem>
	  <para>Causes Alex to produce a lexer which is optimised for
          compiling with GHC.  The lexer will be significantly more
	  efficient, both in terms of the size of the compiled
	  lexer and its runtime.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-d</option></term>
	<term><option>--debug</option></term>
	<listitem>
	  <para>Causes Alex to produce a lexer which will output
	  debugging messages as it runs.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-?</option></term>
	<term><option>--help</option></term>
	<listitem>
	  <para>Display help and exit.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-V</option></term>
	<term><option>--version</option></term>
	<listitem>
	  <para>Output version information and exit.  Note that for legacy
	  reasons <option>-v</option> is also supported, but its use is
	  deprecated: <option>-v</option> will be used for verbose mode
	  when that is actually implemented.</para>
	</listitem>
      </varlistentry>
    </variablelist>
  </chapter>

</book>