You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
426 lines
16 KiB
426 lines
16 KiB
#Region "Microsoft.VisualBasic::cf74eccb4985e74bbad7865a4dce631f, LINQ\TestMain\Parser\Tokenizer.vb"
|
|
|
|
' Author:
|
|
'
|
|
' asuka (amethyst.asuka@gcmodeller.org)
|
|
' xie (genetics@smrucc.org)
|
|
' xieguigang (xie.guigang@live.com)
|
|
'
|
|
' Copyright (c) 2018 GPL3 Licensed
|
|
'
|
|
'
|
|
' GNU GENERAL PUBLIC LICENSE (GPL3)
|
|
'
|
|
'
|
|
' This program is free software: you can redistribute it and/or modify
|
|
' it under the terms of the GNU General Public License as published by
|
|
' the Free Software Foundation, either version 3 of the License, or
|
|
' (at your option) any later version.
|
|
'
|
|
' This program is distributed in the hope that it will be useful,
|
|
' but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
' GNU General Public License for more details.
|
|
'
|
|
' You should have received a copy of the GNU General Public License
|
|
' along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
' /********************************************************************************/
|
|
|
|
' Summaries:
|
|
|
|
' Class Tokenizer
|
|
'
|
|
' Properties: Current, IsChar, IsComma, IsDot, IsInvalid
|
|
' IsNumber, IsOperator, IsSpace
|
|
'
|
|
' Constructor: (+1 Overloads) Sub New
|
|
'
|
|
' Function: GetNextToken, GetNumber, GetOperator, GetString
|
|
'
|
|
' Sub: MoveNext
|
|
'
|
|
'
|
|
' /********************************************************************************/
|
|
|
|
#End Region
|
|
|
|
Imports System.CodeDom
|
|
Imports System.Text
|
|
|
|
Namespace Parser
|
|
|
|
''' <summary>
|
|
''' Divides the string into tokens.
|
|
''' </summary>
|
|
Public Class Tokenizer

    ' Splits one input string into Token values, one per GetNextToken call.
    ' The tokenizer keeps a single-character cursor (_En) and remembers the
    ' last token it produced (_PrevToken) so that context-sensitive decisions
    ' (unary vs. binary minus, quoted vs. unquoted text) can be made.

    ''' <summary>Cursor over the characters of the string being tokenized.</summary>
    Private ReadOnly _En As CharEnumerator

    ''' <summary>True once the cursor has moved past the last character.</summary>
    Private _IsInvalid As Boolean = False

    ''' <summary>The token most recently produced; starts as <c>Token.NullToken</c>.</summary>
    Private _PrevToken As Token = Token.NullToken

    ''' <summary>
    ''' A tokenizer is always constructed on a single string. Create one tokenizer per string.
    ''' </summary>
    ''' <param name="s">string to tokenize; an empty string yields a tokenizer that is immediately invalid</param>
    ''' <exception cref="ArgumentNullException"><paramref name="s"/> is Nothing.</exception>
    Public Sub New(s As String)
        If s Is Nothing Then
            Throw New ArgumentNullException("s")
        End If

        _En = s.GetEnumerator()
        ' Position the cursor on the first character (or mark the tokenizer invalid).
        MoveNext()
    End Sub

    ''' <summary>
    ''' Moves to the next character. If there are no more characters, then the tokenizer is
    ''' invalid.
    ''' </summary>
    Private Sub MoveNext()
        If Not _En.MoveNext() Then
            _IsInvalid = True
        End If
    End Sub

    ''' <summary>
    ''' Consumes the current character when it equals <paramref name="expected"/>.
    ''' </summary>
    ''' <param name="expected">character to look for at the cursor</param>
    ''' <returns>True when the character matched and the cursor was advanced; False otherwise
    ''' (including when the input is exhausted).</returns>
    Private Function TryConsume(expected As Char) As Boolean
        ' _En.Current throws once the enumerator is past the end, so test validity first.
        If _IsInvalid OrElse _En.Current <> expected Then
            Return False
        End If

        MoveNext()
        Return True
    End Function

    ''' <summary>
    ''' Allows access to the token most recently parsed.
    ''' </summary>
    Public ReadOnly Property Current() As Token
        Get
            Return _PrevToken
        End Get
    End Property

    ''' <summary>
    ''' Indicates that there are no more characters in the string and tokenizer is finished.
    ''' </summary>
    Public ReadOnly Property IsInvalid() As Boolean
        Get
            Return _IsInvalid
        End Get
    End Property

    ''' <summary>
    ''' Is the current character an ASCII letter (A-Z, a-z) or underscore?
    ''' </summary>
    Public ReadOnly Property IsChar() As Boolean
        Get
            If _IsInvalid Then
                Return False
            End If

            Dim c As Char = _En.Current
            Return (c >= "A"c AndAlso c <= "Z"c) OrElse
                   (c >= "a"c AndAlso c <= "z"c) OrElse
                   c = "_"c
        End Get
    End Property

    ''' <summary>
    ''' Is the current character a dot (".")?
    ''' </summary>
    Public ReadOnly Property IsDot() As Boolean
        Get
            If _IsInvalid Then
                Return False
            End If

            Return _En.Current = "."c
        End Get
    End Property

    ''' <summary>
    ''' Is the current character a comma?
    ''' </summary>
    Public ReadOnly Property IsComma() As Boolean
        Get
            If _IsInvalid Then
                Return False
            End If

            Return _En.Current = ","c
        End Get
    End Property

    ''' <summary>
    ''' Is the current character an ASCII digit (0-9)?
    ''' </summary>
    Public ReadOnly Property IsNumber() As Boolean
        Get
            If _IsInvalid Then
                Return False
            End If

            Dim c As Char = _En.Current
            Return c >= "0"c AndAlso c <= "9"c
        End Get
    End Property

    ''' <summary>
    ''' Is the current character a space or a tab? Other whitespace (e.g. line breaks)
    ''' is not treated as space by this property.
    ''' </summary>
    Public ReadOnly Property IsSpace() As Boolean
        Get
            If _IsInvalid Then
                Return False
            End If

            Dim c As Char = _En.Current
            Return c = " "c OrElse c = ControlChars.Tab
        End Get
    End Property

    ''' <summary>
    ''' Is the current character one that starts an operator, a parenthesis, a bracket
    ''' or a quoted string?
    ''' </summary>
    Public ReadOnly Property IsOperator() As Boolean
        Get
            If _IsInvalid Then
                Return False
            End If

            Select Case _En.Current
                Case ">"c, "<"c, "="c, "-"c, "+"c, "!"c, _
                     "/"c, "%"c, "*"c, "&"c, "|"c, "("c, _
                     ")"c, "["c, "]"c, """"c
                    Return True
                Case Else
                    Return False
            End Select
        End Get
    End Property

    ''' <summary>
    ''' Gets the next token in the string. Reads as many characters as necessary to retrieve
    ''' that token.
    ''' </summary>
    ''' <returns>
    ''' The next token, or <c>Token.NullToken</c> when the input is exhausted or the current
    ''' character is not recognized (the unrecognized character is consumed).
    ''' </returns>
    ''' <exception cref="ArgumentException">A numeric literal is malformed.</exception>
    Public Function GetNextToken() As Token
        If _IsInvalid Then
            Return Token.NullToken
        End If

        ' Eat spaces with a loop rather than recursion so that long runs of
        ' whitespace cannot exhaust the call stack.
        While IsSpace
            MoveNext()
        End While

        Dim nextToken As Token

        If _IsInvalid Then
            ' Only trailing whitespace was left in the input.
            nextToken = Token.NullToken
        ElseIf IsChar Then
            nextToken = GetString()
        ElseIf IsComma Then
            nextToken = New Token(",", TokenType.Comma, TokenPriority.None)
            MoveNext()
        ElseIf IsDot Then
            nextToken = New Token(".", TokenType.Dot, TokenPriority.None)
            MoveNext()
        ElseIf IsNumber Then
            nextToken = GetNumber()
        ElseIf IsOperator Then
            nextToken = GetOperator()
        Else
            ' Unknown character: skip it so the caller cannot get stuck on it.
            nextToken = Token.NullToken
            MoveNext()
        End If

        _PrevToken = nextToken
        Return nextToken
    End Function

    ''' <summary>
    ''' Anything that starts with a character is considered a string. This could be a
    ''' primitive quoted string, a primitive expression, or an identifier
    ''' </summary>
    ''' <returns>
    ''' A Boolean primitive for the text <c>true</c>/<c>false</c>; a string primitive when the
    ''' previous token was a quote; otherwise an identifier token.
    ''' </returns>
    ''' <remarks>
    ''' In quoted mode every character up to the closing quote is kept. A backslash is kept
    ''' together with the character that follows it, so an escaped quote does not end the
    ''' string. A string that is never closed simply runs to the end of the input.
    ''' </remarks>
    Private Function GetString() As Token
        Dim quoted As Boolean = (_PrevToken.Type = TokenType.Quote)
        Dim sb As New StringBuilder()

        If quoted Then
            ' The loop starts on the first character after the opening quote, so an
            ' immediate closing quote (empty string), a leading escape sequence and
            ' "input ended right after the quote" are all handled uniformly.
            While Not _IsInvalid
                Dim c As Char = _En.Current

                If c = """"c Then
                    ' Closing quote: consume it and stop.
                    MoveNext()
                    Exit While
                End If

                sb.Append(c)

                If c = "\"c Then
                    ' In the case of \, keep that character and whatever character follows it.
                    MoveNext()
                    If _IsInvalid Then Exit While
                    sb.Append(_En.Current)
                End If

                MoveNext()
            End While
        Else
            ' The caller guarantees the cursor is on a letter or underscore.
            sb.Append(_En.Current)
            MoveNext()

            ' Identifiers continue through letters, underscores and digits.
            While IsChar OrElse IsNumber
                sb.Append(_En.Current)
                MoveNext()
            End While
        End If

        Dim s As String = sb.ToString()

        ' "false" or "true" is a primitive expression.
        ' NOTE(review): this check also applies to quoted text, so the string literal
        ' "true" becomes a Boolean primitive. Kept as-is for compatibility; confirm intent.
        If s = "false" OrElse s = "true" Then
            Return New Token([Boolean].Parse(s), TokenType.Primitive, TokenPriority.None)
        End If

        ' The previous token was a quote, so this is a primitive string.
        If quoted Then
            Return New Token(s, TokenType.Primitive, TokenPriority.None)
        End If

        ' The default is that the string indicates an identifier.
        Return New Token(s, TokenType.Identifier, TokenPriority.None)
    End Function

    ''' <summary>
    ''' A token that starts with a number can be an integer, a long, or a double.
    ''' </summary>
    ''' <returns>A primitive token holding an Int32, Int64 or Double value.</returns>
    ''' <remarks>
    ''' An integer is the default for numbers. Numbers can also be followed by a
    ''' l, L, d, or D character to indicate a long or a double value respectively.
    ''' Any numbers containing a dot (".") are considered doubles.
    ''' Parsing uses the invariant culture so "." is always the decimal separator.
    ''' </remarks>
    ''' <exception cref="ArgumentException">
    ''' The literal contains an unexpected letter, more than one dot, characters after a
    ''' type suffix, or combines a decimal point with the long suffix.
    ''' </exception>
    Private Function GetNumber() As Token
        Dim sb As New StringBuilder()
        sb.Append(_En.Current)

        Dim isDouble As Boolean = False
        Dim isLong As Boolean = False
        Dim cont As Boolean = True

        While cont
            If _IsInvalid Then
                Exit While
            End If

            MoveNext()

            If _IsInvalid Then
                Exit While
            End If

            If IsNumber Then
                sb.Append(_En.Current)

            ElseIf IsChar Then
                Select Case _En.Current
                    Case "D"c, "d"c
                        isDouble = True

                    Case "L"c, "l"c
                        If isDouble Then
                            ' e.g. "1.5L": a fractional value cannot be a long.
                            sb.Append(_En.Current)
                            Throw New ArgumentException("Invalid number: " & sb.ToString())
                        End If
                        isLong = True

                    Case Else
                        sb.Append(_En.Current)
                        Throw New ArgumentException("Invalid number: " & sb.ToString())
                End Select

                ' A type suffix must be the last character of the literal.
                MoveNext()
                If IsChar OrElse IsNumber Then
                    sb.Append(_En.Current)
                    Throw New ArgumentException("Invalid number: " & sb.ToString())
                End If
                cont = False

            ElseIf IsDot Then
                sb.Append(_En.Current)

                If isDouble Then
                    ' The number has already been marked as a double, which means it
                    ' already contains a dot.
                    Throw New ArgumentException("Invalid number: " & sb.ToString())
                End If
                isDouble = True

            Else
                Exit While
            End If
        End While

        Dim s As String = sb.ToString()
        Dim invariant As System.Globalization.CultureInfo = System.Globalization.CultureInfo.InvariantCulture

        If isLong Then
            Return New Token(Int64.Parse(s, invariant), TokenType.Primitive, TokenPriority.None)
        End If

        If isDouble Then
            Return New Token([Double].Parse(s, invariant), TokenType.Primitive, TokenPriority.None)
        End If

        Return New Token(Int32.Parse(s, invariant), TokenType.Primitive, TokenPriority.None)
    End Function

    ''' <summary>
    ''' Some operators take more than one character. Also, the tokenizer is able to
    ''' categorize the token's priority based on what kind of operator it is.
    ''' </summary>
    ''' <returns>
    ''' The operator, parenthesis or bracket token at the cursor; for a double quote, the
    ''' string token that follows it. <c>Token.NullToken</c> if the current character is not
    ''' one of the characters accepted by <see cref="IsOperator"/>.
    ''' </returns>
    Private Function GetOperator() As Token
        Dim first As Char = _En.Current
        Dim op As New String(first, 1)

        Select Case first
            Case "<"c, "="c, ">"c
                MoveNext()
                ' Optional trailing "=" forms <=, == or >=.
                If TryConsume("="c) Then
                    op &= "="
                End If
                Return New Token(op, TokenType.[Operator], TokenPriority.Equality)

            Case "-"c
                MoveNext()
                ' A minus that follows a value is binary; anywhere else it is a sign.
                If _PrevToken.Type = TokenType.Primitive OrElse
                   _PrevToken.Type = TokenType.Identifier OrElse
                   _PrevToken.Type = TokenType.CloseParens Then
                    Return New Token(op, TokenType.[Operator], TokenPriority.PlusMinus)
                End If
                Return New Token(op, TokenType.[Operator], TokenPriority.UnaryMinus)

            Case "+"c
                MoveNext()
                Return New Token(op, TokenType.[Operator], TokenPriority.PlusMinus)

            Case "!"c
                MoveNext()
                If TryConsume("="c) Then
                    op &= "="
                    Return New Token(op, TokenType.[Operator], TokenPriority.Equality)
                End If
                Return New Token(op, TokenType.[Operator], TokenPriority.[Not])

            Case "*"c, "/"c
                MoveNext()
                Return New Token(op, TokenType.[Operator], TokenPriority.MulDiv)

            Case "%"c
                MoveNext()
                Return New Token(op, TokenType.[Operator], TokenPriority.[Mod])

            Case "|"c
                MoveNext()
                If TryConsume("|"c) Then
                    op &= "|"
                End If
                Return New Token(op, TokenType.[Operator], TokenPriority.[Or])

            Case "&"c
                MoveNext()
                If TryConsume("&"c) Then
                    op &= "&"
                End If
                Return New Token(op, TokenType.[Operator], TokenPriority.[And])

            Case "("c
                MoveNext()
                Return New Token(op, TokenType.OpenParens, TokenPriority.None)

            Case ")"c
                MoveNext()
                Return New Token(op, TokenType.CloseParens, TokenPriority.None)

            Case "["c
                MoveNext()
                Return New Token(op, TokenType.OpenBracket, TokenPriority.None)

            Case "]"c
                MoveNext()
                Return New Token(op, TokenType.CloseBracket, TokenPriority.None)

            Case """"c
                ' The quote itself is not reported to the caller; it is only recorded as
                ' the previous token so GetString knows to read a quoted string.
                MoveNext()
                _PrevToken = New Token(op, TokenType.Quote, TokenPriority.None)
                Return GetString()
        End Select

        Return Token.NullToken
    End Function

End Class
|
|
End Namespace
|