11#
2- # This is a simple lexer for the C programming language.
2+ # This is a simple lexer for the C programming language.
33# MIT license. (c) 2023 Pascal Bourguignon
44#
55
66class CLexer
7+ #
8+ # CLexer is a simple C lexer. It is used to tokenize a C source file.
9+ #
10+ # Usage:
11+ # lexer = CLexer.new(pre_processed_c_source)
12+ # tokens = lexer.tokenize
13+ #
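14+ # The tokenize method returns an array of tokens: keywords and operators are symbols, while identifiers and literals are [type, text] pairs such as [:identifier, "x"].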
715
816 KEYWORDS = %w[ auto break case char const continue default do double else enum
917 extern float for goto if int long register return short signed
@@ -67,22 +75,21 @@ class CLexer
6775 '{' => :open_brace ,
6876 '|' => :logical_or_op ,
6977 '}' => :close_brace ,
70- '~' => :bitwise_not_op ,
78+ '~' => :bitwise_not_op
7179
7280 } . freeze
7381
74-
# Regexp matching a single operator at the very start of the input.
#
# Ruby tries alternatives left to right and keeps the first one that matches,
# so the operator strings are ordered longest-first (ties broken
# alphabetically for a deterministic pattern). Without that ordering, an
# operator that is a prefix of a longer one could be matched in its place,
# depending on the key order of OPERATOR_SYMBOLS.
OPERATOR_REGEX = Regexp.new(
  '\A(' +
  OPERATOR_SYMBOLS.keys
                  .sort_by { |op| [-op.length, op] }
                  .map { |op| Regexp.escape(op) }
                  .join('|') +
  ')'
)

# Token symbols emitted for operators (the values of OPERATOR_SYMBOLS).
OPERATOR_SYMS = OPERATOR_SYMBOLS.values.freeze

# The entries of KEYWORDS converted to symbols, in the same order.
KEYWORDS_SYMS = KEYWORDS.map(&:to_sym).freeze
7885
# Build a lexer over the given source text.
#
# input - the text to tokenize; the regexp matching in #tokenize expects it
#         to behave like a String.
def initialize(input)
  # Tokens collected so far; #tokenize appends to this list.
  @tokens = []
  # Text that has not been consumed yet; #tokenize replaces it with the
  # remainder after each match.
  @input = input
end
8390
8491 def tokenize
85- while @input . size > 0
92+ while @input . size . positive?
8693 case @input
8794 when /\A [[:space:]]+/m
8895 @input = $'
@@ -91,35 +98,35 @@ def tokenize
9198 when /\A \/ \* /
9299 consume_multiline_comment
93100 when /\A [_a-zA-Z][_a-zA-Z0-9]*/
94- identifier_or_keyword = $& ;
101+ identifier_or_keyword = $&
95102 @input = $'
96103 if KEYWORDS . include? ( identifier_or_keyword )
97104 @tokens << identifier_or_keyword . to_sym
98105 else
99106 @tokens << [ :identifier , identifier_or_keyword ]
100107 end
101108 when /\A \d +\. \d *([eE][+-]?\d +)?[fFlL]?|\. \d +([eE][+-]?\d +)?[fFlL]?|\d +[eE][+-]?\d +[fFlL]?/
102- float_constant = $& ;
109+ float_constant = $&
103110 @input = $'
104111 @tokens << [ :float_literal , float_constant ]
105112 when /\A \d +/
106- integer_constant = $& ;
113+ integer_constant = $&
107114 @input = $'
108115 @tokens << [ :integer_literal , integer_constant ]
109116 when /\A 0[xX][0-9a-fA-F]+/
110- hex_constant = $& ;
117+ hex_constant = $&
111118 @input = $'
112119 @tokens << [ :hex_literal , hex_constant ]
113120 when /\A '((\\ .|[^\\ '])*)'/
114- char_literal = $& ;
121+ char_literal = $&
115122 @input = $'
116123 @tokens << [ :char_literal , char_literal ]
117124 when /\A "((\\ .|[^\\ "])*)"/
118- string_literal = $& ;
125+ string_literal = $&
119126 @input = $'
120127 @tokens << [ :string_literal , string_literal ]
121128 when OPERATOR_REGEX
122- operator = $& ;
129+ operator = $&
123130 @input = $'
124131 @tokens << OPERATOR_SYMBOLS [ operator ]
125132 else
@@ -133,7 +140,7 @@ def tokenize
133140 private
134141
135142 def consume_multiline_comment
136- while @input . size > 0
143+ while @input . size . positive?
137144 case @input
138145 when /\A \* \/ /
139146 @input = $'
@@ -145,8 +152,8 @@ def consume_multiline_comment
145152 end
146153end
147154
148- def example
149- input = File . read ( "/home/pbourguignon/src/c-tidbits/pipes/ tee.out.c" )
155+ def example
156+ input = File . read ( ' tee.c' )
150157 lexer = CLexer . new ( input )
151158 tokens = lexer . tokenize
152159 puts tokens . inspect
0 commit comments