[m-rev.] diff: fix parsing_utils.src_to_line_numbers for non-ASCII input

Peter Wang novalazy at gmail.com
Mon Sep 5 16:16:17 AEST 2011


Branches: main, 11.07

parsing_utils.src_to_line_numbers crashed if given an input string
containing characters that occupy more than one code unit, e.g. any
non-ASCII character when strings are encoded as UTF-8.

library/parsing_utils.m:
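	Rewrite src_to_line_numbers to step through the string with
	string.unsafe_index_next instead of indexing every code unit offset.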

tests/general/test_parsing_utils.exp:
tests/general/test_parsing_utils.m:
	Add test case.

tests/general/test_parsing_utils.exp2:
	Add expected output for grades using UTF-16 string encoding.

diff --git a/library/parsing_utils.m b/library/parsing_utils.m
index 454dd05..595da30 100644
--- a/library/parsing_utils.m
+++ b/library/parsing_utils.m
@@ -437,13 +437,23 @@ skip_whitespace(Src, PS0, PS) :-
 
 src_to_line_numbers(Src) = LineNos :-
     Str = Src ^ input_string,
-    Lo = 0,
-    Hi = Src ^ input_length - 1,
-    F = ( func(I, Ns) =
-        ( if string.unsafe_index(Str, I) = ('\n') then [I | Ns] else Ns )
-    ),
-    LineNosList = int.fold_down(F, Lo, Hi, [Src ^ input_length]),
-    LineNos = array(LineNosList).
+    src_to_line_numbers_2(Str, 0, [], RevLineNosList),
+    LineNos = array.from_reverse_list(RevLineNosList).
+
+:- pred src_to_line_numbers_2(string::in, int::in,
+    list(int)::in, list(int)::out) is det.
+
+src_to_line_numbers_2(Str, Pos0, !RevLineNosList) :-
+    ( string.unsafe_index_next(Str, Pos0, Pos, Char) ->
+        ( Char = '\n' ->
+            !:RevLineNosList = [Pos0 | !.RevLineNosList]
+        ;
+            true
+        ),
+        src_to_line_numbers_2(Str, Pos, !RevLineNosList)
+    ;
+        !:RevLineNosList = [Pos0 | !.RevLineNosList]
+    ).
 
 %-----------------------------------------------------------------------------%
 
diff --git a/tests/general/test_parsing_utils.exp b/tests/general/test_parsing_utils.exp
index 977e8be..fa22c89 100644
--- a/tests/general/test_parsing_utils.exp
+++ b/tests/general/test_parsing_utils.exp
@@ -269,6 +269,7 @@ pass: one_or_more(int_with_state) on "abc"
 pass: one_or_more(int_with_state) on "1 2 3"
 	returned [3, 2, 1] as expected
 	[5 chars consumed]
+--
 Line = 2, Pos = 5
 Line = 2, Pos = 3
 Line = 7, Pos = 1
@@ -279,6 +280,18 @@ Line = 1, Pos = 1
 Line = 2, Pos = 10
 Line = 3, Pos = 1
 Line = 1, Pos = 1
+--
+Line = 1, Pos = 1
+Line = 1, Pos = 2
+Line = 1, Pos = 3
+Line = 1, Pos = 4
+Line = 1, Pos = 5
+Line = 2, Pos = 1
+Line = 2, Pos = 2
+Line = 2, Pos = 3
+Line = 2, Pos = 4
+Line = 3, Pos = 1
+--
 expecting an operator
 12 + x-pow(x + 3; y)
                 ^
diff --git a/tests/general/test_parsing_utils.exp2 b/tests/general/test_parsing_utils.exp2
new file mode 100644
index 0000000..6f05593
--- /dev/null
+++ b/tests/general/test_parsing_utils.exp2
@@ -0,0 +1,315 @@
+pass: next_char on ""
+	failed as expected
+pass: next_char on "123"
+	returned '1' as expected
+	[1 chars consumed]
+pass: char_in_class("123") on ""
+	failed as expected
+pass: char_in_class("123") on "abc"
+	failed as expected
+pass: char_in_class("123") on "123"
+	returned '1' as expected
+	[1 chars consumed]
+pass: punct("!") on ""
+	failed as expected
+pass: punct("!") on "abc"
+	failed as expected
+pass: punct("!") on "*"
+	failed as expected
+pass: punct("!") on "!"
+	returned unit as expected
+	[1 chars consumed]
+pass: keyword("ABC", "ABC") on ""
+	failed as expected
+pass: keyword("ABC", "ABC") on "123"
+	failed as expected
+pass: keyword("ABC", "ABC") on "ABCA"
+	failed as expected
+pass: keyword("ABC", "ABC") on "ABC 123"
+	returned unit as expected
+	[4 chars consumed]
+pass: keyword("αβγ", "αβγ") on ""
+	failed as expected
+pass: keyword("αβγ", "αβγ") on "123"
+	failed as expected
+pass: keyword("αβγ", "αβγ") on "αβγα"
+	failed as expected
+pass: keyword("αβγ", "αβγ") on "αβγ 123"
+	returned unit as expected
+	[4 chars consumed]
+pass: keyword("ABC", "ABC") on "abc 123"
+	returned unit as expected
+	[4 chars consumed]
+pass: ikeyword("αβγ", "αβγ") on "αβγ 123"
+	returned unit as expected
+	[4 chars consumed]
+pass: identifier("ABC", "ABCabc_") on ""
+	failed as expected
+pass: identifier("ABC", "ABCabc_") on "abc"
+	failed as expected
+pass: identifier("ABC", "ABCabc_") on "_"
+	failed as expected
+pass: identifier("ABC", "ABCabc_") on "A"
+	returned "A" as expected
+	[1 chars consumed]
+pass: identifier("ABC", "ABCabc_") on "Ab_c"
+	returned "Ab_c" as expected
+	[4 chars consumed]
+pass: identifier("ABC", "ABCabc_") on "*"
+	failed as expected
+pass: identifier("ABC", "ABCabc_") on "Abc !"
+	returned "Abc" as expected
+	[4 chars consumed]
+pass: identifier("αβγ", "αβγ_") on ""
+	failed as expected
+pass: identifier("αβγ", "αβγ_") on "abc"
+	failed as expected
+pass: identifier("αβγ", "αβγ_") on "_"
+	failed as expected
+pass: identifier("αβγ", "αβγ_") on "α"
+	returned "α" as expected
+	[1 chars consumed]
+pass: identifier("αβγ", "αβγ_") on "αβ_γ"
+	returned "αβ_γ" as expected
+	[4 chars consumed]
+pass: whitespace on ""
+	returned unit as expected
+	[0 chars consumed]
+pass: whitespace on "123"
+	returned unit as expected
+	[0 chars consumed]
+pass: whitespace on "   "
+	returned unit as expected
+	[3 chars consumed]
+pass: whitespace on "   123"
+	returned unit as expected
+	[3 chars consumed]
+pass: skip_to_eol on ""
+	failed as expected
+pass: skip_to_eol on "blah blah
+"
+	returned unit as expected
+	[10 chars consumed]
+pass: skip_to_eol on "blah blah
+123"
+	returned unit as expected
+	[10 chars consumed]
+pass: eof on "123"
+	failed as expected
+pass: eof on ""
+	returned unit as expected
+	[0 chars consumed]
+pass: float_literal_as_string on ""
+	failed as expected
+pass: float_literal_as_string on "abc"
+	failed as expected
+pass: float_literal_as_string on "123"
+	failed as expected
+pass: float_literal_as_string on "123.0   abc"
+	returned "123.0" as expected
+	[8 chars consumed]
+pass: float_literal_as_string on "123.0e1   abc"
+	returned "123.0e1" as expected
+	[10 chars consumed]
+pass: float_literal_as_string on "-123.0   abc"
+	returned "-123.0" as expected
+	[9 chars consumed]
+pass: float_literal_as_string on "-123.0e1   abc"
+	returned "-123.0e1" as expected
+	[11 chars consumed]
+pass: float_literal_as_string on "-123.0E-1   abc"
+	returned "-123.0E-1" as expected
+	[12 chars consumed]
+pass: float_literal on ""
+	failed as expected
+pass: float_literal on "abc"
+	failed as expected
+pass: float_literal on "123"
+	failed as expected
+pass: float_literal on "123.0   abc"
+	returned 123.0 as expected
+	[8 chars consumed]
+pass: float_literal on "123.0e1   abc"
+	returned 1230.0 as expected
+	[10 chars consumed]
+pass: float_literal on "-123.0   abc"
+	returned -123.0 as expected
+	[9 chars consumed]
+pass: float_literal on "-123.0e1   abc"
+	returned -1230.0 as expected
+	[11 chars consumed]
+pass: float_literal on "-123.0E-1   abc"
+	returned -12.3 as expected
+	[12 chars consumed]
+pass: int_literal_as_string on ""
+	failed as expected
+pass: int_literal_as_string on "abc"
+	failed as expected
+pass: int_literal_as_string on "123.0"
+	failed as expected
+pass: int_literal_as_string on "123   abc"
+	returned "123" as expected
+	[6 chars consumed]
+pass: int_literal_as_string on "-123   abc"
+	returned "-123" as expected
+	[7 chars consumed]
+pass: int_literal_as_string on "999999999999999999999   abc"
+	returned "999999999999999999999" as expected
+	[24 chars consumed]
+pass: int_literal on ""
+	failed as expected
+pass: int_literal on "abc"
+	failed as expected
+pass: int_literal on "123.0"
+	failed as expected
+pass: int_literal on "123   abc"
+	returned 123 as expected
+	[6 chars consumed]
+pass: int_literal on "-123   abc"
+	returned -123 as expected
+	[7 chars consumed]
+pass: int_literal on "999999999999999999999   abc"
+	failed as expected
+pass: string_literal('\"') on ""
+	failed as expected
+pass: string_literal('\"') on ""123"   abc"
+	returned "123" as expected
+	[8 chars consumed]
+pass: string_literal('\"') on ""1\"2\"3"   abc"
+	returned "1\\\"2\\\"3" as expected
+	[12 chars consumed]
+pass: string_literal('\'') on ""
+	failed as expected
+pass: string_literal('\'') on "'123'   abc"
+	returned "123" as expected
+	[8 chars consumed]
+pass: string_literal('\'') on "'1\'2\'3'   abc"
+	returned "1\\\'2\\\'3" as expected
+	[12 chars consumed]
+pass: string_literal('‖') on ""
+	failed as expected
+pass: string_literal('‖') on "‖123‖   abc"
+	returned "123" as expected
+	[8 chars consumed]
+pass: string_literal('‖') on "‖αβγ‖   abc"
+	returned "αβγ" as expected
+	[8 chars consumed]
+pass: optional(punct("!")) on ""
+	returned no as expected
+	[0 chars consumed]
+pass: optional(punct("!")) on "abc"
+	returned no as expected
+	[0 chars consumed]
+pass: optional(punct("!")) on "!   "
+	returned yes(unit) as expected
+	[4 chars consumed]
+pass: zero_or_more(punct("!")) on ""
+	returned [] as expected
+	[0 chars consumed]
+pass: zero_or_more(punct("!")) on "abc"
+	returned [] as expected
+	[0 chars consumed]
+pass: zero_or_more(punct("!")) on "!!!   abc"
+	returned [unit, unit, unit] as expected
+	[6 chars consumed]
+pass: one_or_more(punct("!")) on ""
+	failed as expected
+pass: one_or_more(punct("!")) on "abc"
+	failed as expected
+pass: one_or_more(punct("!")) on "!!!   abc"
+	returned [unit, unit, unit] as expected
+	[6 chars consumed]
+pass: brackets("(", ")", punct("!")) on ""
+	failed as expected
+pass: brackets("(", ")", punct("!")) on "abc"
+	failed as expected
+pass: brackets("(", ")", punct("!")) on "(abc)"
+	failed as expected
+pass: brackets("(", ")", punct("!")) on "(!)   abc"
+	returned unit as expected
+	[6 chars consumed]
+pass: separated_list("+", punct("!")) on ""
+	returned [] as expected
+	[0 chars consumed]
+pass: separated_list("+", punct("!")) on "abc"
+	returned [] as expected
+	[0 chars consumed]
+pass: separated_list("+", punct("!")) on "!   abc"
+	returned [unit] as expected
+	[4 chars consumed]
+pass: separated_list("+", punct("!")) on "!+ ! + !   abc"
+	returned [unit, unit, unit] as expected
+	[11 chars consumed]
+pass: comma_separated_list(punct("!")) on ""
+	returned [] as expected
+	[0 chars consumed]
+pass: comma_separated_list(punct("!")) on "abc"
+	returned [] as expected
+	[0 chars consumed]
+pass: comma_separated_list(punct("!")) on "!   abc"
+	returned [unit] as expected
+	[4 chars consumed]
+pass: comma_separated_list(punct("!")) on "!, ! , !   abc"
+	returned [unit, unit, unit] as expected
+	[11 chars consumed]
+pass: optional(int_with_state) on "abc"
+	returned [] as expected
+	[0 chars consumed]
+pass: optional(int_with_state) on "1"
+	returned [1] as expected
+	[1 chars consumed]
+pass: zero_or_more(int_with_state) on "abc"
+	returned [] as expected
+	[0 chars consumed]
+pass: zero_or_more(int_with_state) on "1 2 3"
+	returned [3, 2, 1] as expected
+	[5 chars consumed]
+pass: one_or_more(int_with_state) on "abc"
+	failed as expected
+pass: one_or_more(int_with_state) on "1 2 3"
+	returned [3, 2, 1] as expected
+	[5 chars consumed]
+--
+Line = 2, Pos = 5
+Line = 2, Pos = 3
+Line = 7, Pos = 1
+Line = 4, Pos = 2
+Line = 3, Pos = 1
+Line = 1, Pos = 3
+Line = 1, Pos = 1
+Line = 2, Pos = 10
+Line = 3, Pos = 1
+Line = 1, Pos = 1
+--
+Line = 1, Pos = 1
+Line = 1, Pos = 2
+Line = 1, Pos = 3
+Line = 1, Pos = 4
+Line = 2, Pos = 1
+Line = 2, Pos = 2
+Line = 3, Pos = 1
+Line = 4, Pos = 1
+Line = 4, Pos = 2
+Line = 4, Pos = 3
+--
+expecting an operator
+12 + x-pow(x + 3; y)
+                ^
+syntax error
+abs(x ++ 3)
+       ^
+expecting an operator
+abs (x))
+       ^
+unknown function: f
+1 + 3 MoD 2 + f(3 + x)
+              ^
+expecting an operator
+1 + /* comment */ 3 mody 2 + f(3 + x)
+                    ^
+expecting an operator
+1 + 1x
+     ^
+unterminated comment
+1 + 2 /* blah blah ...
+                      ^
diff --git a/tests/general/test_parsing_utils.m b/tests/general/test_parsing_utils.m
index 2ca65f5..e9f0460 100644
--- a/tests/general/test_parsing_utils.m
+++ b/tests/general/test_parsing_utils.m
@@ -33,6 +33,7 @@
 
 main(!IO) :-
     unsorted_aggregate(run_test, io.write_string, !IO),
+    io.write_string("--\n", !IO),
     test_pos("123456789\n123456789\n", 14, !IO),
     test_pos("\n123456789\n123456789\n\n\n\n\n\n", 3, !IO),
     test_pos("\n1234\n12\n\n\nfewefwef\nwwfwe\n\n", 20, !IO),
@@ -43,6 +44,18 @@ main(!IO) :-
     test_pos("123456789\n123456789\n\n", 19, !IO),
     test_pos("123456789\n123456789\n\n", 20, !IO),
     test_pos("", 0, !IO),
+    io.write_string("--\n", !IO),
+    test_pos("ábc\n☿\n\n", 0, !IO),
+    test_pos("ábc\n☿\n\n", 1, !IO),
+    test_pos("ábc\n☿\n\n", 2, !IO),
+    test_pos("ábc\n☿\n\n", 3, !IO),
+    test_pos("ábc\n☿\n\n", 4, !IO),
+    test_pos("ábc\n☿\n\n", 5, !IO),
+    test_pos("ábc\n☿\n\n", 6, !IO),
+    test_pos("ábc\n☿\n\n", 7, !IO),
+    test_pos("ábc\n☿\n\n", 8, !IO),
+    test_pos("ábc\n☿\n\n", 9, !IO),
+    io.write_string("--\n", !IO),
     test_err("12 + x-pow(x + 3; y)", expr_top, !IO),
     test_err("abs(x ++ 3)", expr_top, !IO),
     test_err("abs (x))", expr_top, !IO),

--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to:       mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions:          mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------



More information about the reviews mailing list