# Lexical analyzer which tokenizes C language source into pp-tokens.
#
# Author::    Yutaka Yanoh <mailto:yanoh@users.sourceforge.net>
# Copyright:: Copyright (C) 2010-2012, OGIS-RI Co.,Ltd.
# License::   GPLv3+: GNU General Public License version 3 or later
#
# Owner::     Yutaka Yanoh <mailto:yanoh@users.sourceforge.net>

#--
#     ___    ____  __    ___   _________
#    /   |  / _  |/ /   / / | / /__  __/           Source Code Static Analyzer
#   / /| | / / / / /   / /  |/ /  / /                   AdLint - Advanced Lint
#  / __  |/ /_/ / /___/ / /|  /  / /
# /_/  |_|_____/_____/_/_/ |_/  /_/   Copyright (C) 2010-2012, OGIS-RI Co.,Ltd.
#
# This file is part of AdLint.
#
# AdLint is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# AdLint is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# AdLint.  If not, see <http://www.gnu.org/licenses/>.
#
#++

require "adlint/lexer"
require "adlint/report"
require "adlint/util"

module AdLint #:nodoc:
module Cpp #:nodoc:

  # Entry point of the preprocessor lexer.
  #
  # Keeps the content being scanned together with the current LexerState and
  # hands every token request to that state.  Lexical events are published
  # through the plugin points declared with +def_plugin+ below.
  class Lexer
    extend Pluggable

    def_plugin :on_block_comment_found
    def_plugin :on_line_comment_found
    def_plugin :on_nested_block_comment_found
    def_plugin :on_unterminated_block_comment
    def_plugin :on_eof_newline_not_found
    def_plugin :on_unlexable_char_found
    def_plugin :on_cr_at_eol_found
    def_plugin :on_eof_mark_at_eof_found

    attr_reader :content

    # source:: Object to tokenize.  Its +on_cr_at_eol_found+,
    #          +on_eof_mark_at_eof_found+ and +on_eof_newline_not_found+
    #          events are re-published through this lexer's plugin points of
    #          the same names.
    def initialize(source)
      source.on_cr_at_eol_found += ->(loc) {
        on_cr_at_eol_found.invoke(loc)
      }
      source.on_eof_mark_at_eof_found += ->(loc) {
        on_eof_mark_at_eof_found.invoke(loc)
      }
      source.on_eof_newline_not_found += method(:notify_eof_newline_not_found)

      @content = SourceContent.lazy_new(source)
      @state = Initial.new(self)
      @top_token = nil
    end

    # Returns the next token.  A token previously peeked with #top_token is
    # handed out first; otherwise the current state produces one.
    def next_token
      return @state.next_token unless @top_token

      peeked, @top_token = @top_token, nil
      peeked
    end

    # Peeks at the upcoming token without consuming it.  The token is cached
    # until the following #next_token call returns it.
    def top_token
      @top_token ||= next_token
    end

    # Skips forward over a conditional group, tracking nested
    # #if/#ifdef/#ifndef/#asm ... #endif/#endasm pairs.
    #
    # Returns true when an #else/#elif is seen at the starting nesting level,
    # or when an #endif/#endasm brings the nesting level to zero; in both
    # cases that directive has only been checked, not scanned.  Returns false
    # when no such directive is found.
    def skip_group
      depth = 1
      until @content.empty?
        if @content.check(/[ \t]*#[ \t]*(?:if|ifdef|ifndef|asm)\b/)
          # A nested group opens.
          depth += 1
          @content.scan(/.*\n/)
        elsif @content.check(/[ \t]*#[ \t]*(?:else|elif)\b/)
          return true if depth == 1
          @content.scan(/.*\n/)
        elsif @content.check(/[ \t]*#[ \t]*(?:endif|endasm)\b/)
          depth -= 1
          return true if depth == 0
          @content.scan(/.*\n/)
        end
        return false unless scan_until_next_directive(@content)
      end
      false
    end

    # Replaces the current lexer state with +next_state+.
    def transit(next_state)
      @state = next_state
    end

    # The notify_* methods below fire the plugin point of the matching name
    # with the given arguments.

    def notify_block_comment_found(comment, location)
      on_block_comment_found.invoke(comment, location)
    end

    def notify_line_comment_found(comment, location)
      on_line_comment_found.invoke(comment, location)
    end

    def notify_nested_block_comment_found(location)
      on_nested_block_comment_found.invoke(location)
    end

    def notify_unterminated_block_comment(location)
      on_unterminated_block_comment.invoke(location)
    end

    def notify_eof_newline_not_found(location)
      on_eof_newline_not_found.invoke(location)
    end

    def notify_unlexable_char_found(char, location)
      on_unlexable_char_found.invoke(char, location)
    end

    # Matches (lazily, across lines) everything in front of the next line
    # that starts a group-related directive.
    GROUP_DIRECTIVE_RE =
      /.*?(?=^[ \t]*#[ \t]*(?:if|ifdef|ifndef|asm|else|elif|endif|endasm)\b)/m
    private_constant :GROUP_DIRECTIVE_RE

    private
    # Scans up to, but not including, the next group-related directive.
    # Returns the scanned text, or nil when no such directive follows.
    def scan_until_next_directive(content)
      content.scan(GROUP_DIRECTIVE_RE)
    end
  end

  # Base class of the lexer states.
  #
  # Provides the comment-skipping and token-scanning helpers shared by the
  # concrete states; subclasses implement #next_token.
  class LexerState
    # lexer:: Owner lexer.  The helpers read its +content+ and report events
    #         through its +notify_*+ methods.
    def initialize(lexer)
      @lexer = lexer
    end

    # Returns the next token in this state.  Must be overridden.
    def next_token
      subclass_responsibility
    end

    private
    # Consumes every block and line comment found at the current position,
    # notifying the lexer about each one.
    #
    # Returns true if at least one comment was discarded, false otherwise.
    def discard_heading_comments
      discarded = false
      until @lexer.content.empty?
        case
        when @lexer.content.check(/\/\*/)
          location = @lexer.content.location
          comment = scan_block_comment(@lexer.content)
          # Nothing scanned means nothing consumed; stop instead of spinning.
          break if comment.nil? || comment.empty?
          @lexer.notify_block_comment_found(comment, location)
          discarded = true
        when @lexer.content.check(/\/\//)
          location = @lexer.content.location
          comment = scan_line_comment(@lexer.content)
          break if comment.nil? || comment.empty?
          @lexer.notify_line_comment_found(comment, location)
          discarded = true
        else
          break
        end
      end
      discarded
    end

    # Scans one block comment starting at the current position.
    #
    # The comment ends at the first "*/".  Each additional "/*" seen before
    # that is reported as a nested block comment.  Returns the comment text,
    # or nil when the content does not start with "/*".
    #
    # When no terminator exists, the unterminated comment is reported and the
    # rest of the content is consumed as part of the comment, so that the
    # scan terminates even if the notification handler returns normally.
    def scan_block_comment(content)
      result = ""
      block_depth = 0
      until content.empty?
        location = content.location
        case
        when content.scan(/\/\*/)
          block_depth += 1
          result += "/*"
          @lexer.notify_nested_block_comment_found(location) if block_depth > 1
        when content.scan(/\*\//)
          result += "*/"
          break
        else
          return nil if block_depth == 0
          if comment = content.scan(/.*?(?=\/\*|\*\/)/m)
            result += comment
          else
            @lexer.notify_unterminated_block_comment(location)
            # Neither "/*" nor "*/" follows: swallow the remainder.
            result += content.scan(/.*/m).to_s
            break
          end
        end
      end
      result
    end

    # Scans a "//" comment up to, but not including, the end of the line.
    # A comment on a final line that lacks a newline is scanned as well.
    # Returns the comment text, or nil when the content does not start with
    # "//".
    def scan_line_comment(content)
      content.scan(/\/\/[^\n]*/)
    end

    # Scans a backslash, optional blanks and the newline that follows.
    def scan_escaped_newline(content)
      content.scan(/\\[ \t]*\n/)
    end

    # Scans one preprocessing token: keyword, constant, string literal, null
    # constant, identifier or punctuator, tried in that order.
    # Returns a :PP_TOKEN Token, or nil when nothing matches.
    def tokenize_pp_token(content)
      location = content.location
      case
      when value = Language::C.scan_keyword(content),
           value = Language::Cpp.scan_keyword(content)
        Token.new(:PP_TOKEN, value, location, Language::C::KEYWORDS[value])
      when value = Language::C.scan_char_constant(content),
           value = Language::C.scan_floating_constant(content),
           value = Language::C.scan_integer_constant(content)
        Token.new(:PP_TOKEN, value, location, :CONSTANT)
      when value = Language::C.scan_string_literal(content)
        Token.new(:PP_TOKEN, value, location, :STRING_LITERAL)
      when value = Language::C.scan_null_constant(content)
        Token.new(:PP_TOKEN, value, location, :NULL)
      when value = Language::C.scan_identifier(content)
        Token.new(:PP_TOKEN, value, location, :IDENTIFIER)
      when value = Language::Cpp.scan_punctuator(content)
        Token.new(:PP_TOKEN, value, location, value)
      else
        nil
      end
    end

    # Scans a newline.  Returns a :NEW_LINE Token or nil.
    def tokenize_new_line(content)
      location = content.location
      if value = content.scan(/\n/)
        return Token.new(:NEW_LINE, value, location)
      end
      nil
    end

    # Scans a header name; the system form is tried before the user form.
    # Returns a :SYS_HEADER_NAME or :USR_HEADER_NAME Token, or nil.
    def tokenize_header_name(content)
      location = content.location
      if value = Language::Cpp.scan_system_header_name(content)
        return Token.new(:SYS_HEADER_NAME, value, location)
      elsif value = Language::Cpp.scan_user_header_name(content)
        return Token.new(:USR_HEADER_NAME, value, location)
      end
      nil
    end

    # Scans an identifier.  Returns an :IDENTIFIER Token or nil.
    def tokenize_identifier(content)
      location = content.location
      if value = Language::C.scan_identifier(content)
        return Token.new(:IDENTIFIER, value, location)
      end
      nil
    end

    # Scans a punctuator.  Returns a Token whose type is the punctuator
    # string itself, or nil.
    def tokenize_punctuator(content)
      location = content.location
      if punctuator = Language::Cpp.scan_punctuator(content)
        return Token.new(punctuator, punctuator, location)
      end
      nil
    end
  end

  # Lexer state at the beginning of a line.
  #
  # Recognizes the directive keyword that starts the line (switching the
  # lexer to the matching In*Directive state), a null directive, or an
  # ordinary text line.
  class Initial < LexerState
    # Returns the directive keyword token, a :NULL_DIRECTIVE token or a
    # :TEXT_LINE token for the upcoming line.  Returns nil when no text is
    # left to tokenize.
    def next_token
      # NOTE: An escaped newline may appear at the line above a preprocessing
      #       directive line.
      while scan_escaped_newline(@lexer.content); end

      case
      when @lexer.content.check(/[ \t]*#/)
        case
        when token = tokenize_if_directive(@lexer.content)
          @lexer.transit(InIfDirective.new(@lexer))
        when token = tokenize_ifdef_directive(@lexer.content)
          @lexer.transit(InIfdefDirective.new(@lexer))
        when token = tokenize_ifndef_directive(@lexer.content)
          @lexer.transit(InIfndefDirective.new(@lexer))
        when token = tokenize_elif_directive(@lexer.content)
          @lexer.transit(InElifDirective.new(@lexer))
        when token = tokenize_else_directive(@lexer.content)
          @lexer.transit(InElseDirective.new(@lexer))
        when token = tokenize_endif_directive(@lexer.content)
          @lexer.transit(InEndifDirective.new(@lexer))
        when token = tokenize_include_directive(@lexer.content)
          @lexer.transit(InIncludeDirective.new(@lexer))
        when token = tokenize_include_next_directive(@lexer.content)
          @lexer.transit(InIncludeNextDirective.new(@lexer))
        when token = tokenize_define_directive(@lexer.content)
          @lexer.transit(InDefineDirective.new(@lexer))
        when token = tokenize_undef_directive(@lexer.content)
          @lexer.transit(InUndefDirective.new(@lexer))
        when token = tokenize_line_directive(@lexer.content)
          @lexer.transit(InLineDirective.new(@lexer))
        when token = tokenize_error_directive(@lexer.content)
          @lexer.transit(InErrorDirective.new(@lexer))
        when token = tokenize_pragma_directive(@lexer.content)
          @lexer.transit(InPragmaDirective.new(@lexer))
        when token = tokenize_asm_directive(@lexer.content)
          @lexer.transit(InAsmDirective.new(@lexer))
        when token = tokenize_endasm_directive(@lexer.content)
          @lexer.transit(InEndasmDirective.new(@lexer))
        else
          # A "#" not followed by any known directive name.
          token = tokenize_null_directive(@lexer.content)
        end
      else
        token = tokenize_text_line(@lexer.content)
      end

      token
    end

    private
    # Shared implementation of the tokenize_*_directive methods: scans
    # +pattern+ at the current position and wraps the matched text in a
    # Token of the given +type+.  Returns nil when the pattern does not
    # match.
    def scan_directive_keyword(content, pattern, type)
      # The location must be taken before scanning moves the position.
      location = content.location
      value = content.scan(pattern)
      value ? Token.new(type, value, location) : nil
    end

    def tokenize_if_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*if\b/, :IF)
    end

    def tokenize_ifdef_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*ifdef\b/, :IFDEF)
    end

    def tokenize_ifndef_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*ifndef\b/, :IFNDEF)
    end

    def tokenize_elif_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*elif\b/, :ELIF)
    end

    def tokenize_else_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*else\b/, :ELSE)
    end

    def tokenize_endif_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*endif\b/, :ENDIF)
    end

    def tokenize_include_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*include\b/, :INCLUDE)
    end

    def tokenize_include_next_directive(content)
      # NOTE: #include_next directive is a GCC extension.
      scan_directive_keyword(content, /[ \t]*#[ \t]*include_next\b/,
                             :INCLUDE_NEXT)
    end

    def tokenize_define_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*define\b/, :DEFINE)
    end

    def tokenize_undef_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*undef\b/, :UNDEF)
    end

    def tokenize_line_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*line\b/, :LINE)
    end

    def tokenize_error_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*error\b/, :ERROR)
    end

    def tokenize_pragma_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*pragma\b/, :PRAGMA)
    end

    def tokenize_asm_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*asm\b/, :ASM)
    end

    def tokenize_endasm_directive(content)
      scan_directive_keyword(content, /[ \t]*#[ \t]*endasm\b/, :ENDASM)
    end

    # Scans a "#" line that carries no recognized directive name, up to and
    # including its newline.  Escaped newlines are dropped from the value so
    # the directive may span several physical lines.
    # Returns a :NULL_DIRECTIVE Token.
    def tokenize_null_directive(content)
      location = content.location
      value = content.scan(/[ \t]*#/)
      until content.empty?
        if token = content.scan(/.*?(?=\\[ \t]*\n|\n)/)
          value += token
        elsif rest = content.scan(/.+/)
          # No newline follows at all (final line without a newline): take
          # the remainder so that the loop terminates.
          value += rest
        end

        next if scan_escaped_newline(content)

        if new_line = content.scan(/\n/)
          value += new_line
          break
        end
      end
      Token.new(:NULL_DIRECTIVE, value, location)
    end

    # Scans one logical text line, up to and including its newline.
    #
    # Comments are discarded (and reported through the lexer), escaped
    # newlines are dropped, and string literals and character constants are
    # copied as a whole so that comment-like text inside them is preserved.
    # Returns a :TEXT_LINE Token, or nil when nothing was scanned.
    def tokenize_text_line(content)
      location = content.location
      value = ""

      until content.empty?
        if token = content.scan(/.*?(?=\/\*|\/\/|\\[ \t]*\n|L?"|L?'|\n)/i)
          value += token
        elsif rest = content.scan(/.+/)
          # No newline follows at all (final line without a newline): take
          # the remainder so that the loop terminates.
          value += rest
        end

        if token = content.scan(/\n/)
          value += token
          break
        end

        next if scan_escaped_newline(content)

        case
        when content.check(/\/\*/)
          discard_heading_comments
        when content.check(/\/\//)
          discard_heading_comments
        when content.check(/L?"/i)
          string_literal = Language::C.scan_string_literal(content)
          # A quote that does not start a well-formed literal is copied one
          # character at a time instead of raising on a nil scan result.
          value += string_literal || content.eat!.to_s
        when content.check(/L?'/i)
          char_constant = Language::C.scan_char_constant(content)
          value += char_constant || content.eat!.to_s
        end
      end

      value.empty? ? nil : Token.new(:TEXT_LINE, value, location)
    end
  end

  # Lexer state for the remainder of an #if line (entered from Initial after
  # the :IF token).  Produces pp-tokens until the line ends.
  class InIfDirective < LexerState
    # Returns the next pp-token of the directive, or the :NEW_LINE token that
    # ends it.  Comments and escaped newlines are skipped, and characters
    # that start no token are dropped.  The lexer returns to the Initial
    # state once the :NEW_LINE token is produced.
    def next_token
      content = @lexer.content
      found = nil

      until content.empty?
        next if discard_heading_comments || scan_escaped_newline(content)

        found = tokenize_pp_token(content) || tokenize_new_line(content)
        break if found

        content.eat!
      end

      if found.nil?
        # Content ran out before a newline: synthesize one and report it.
        found = Token.new(:NEW_LINE, "\n", content.location)
        @lexer.notify_eof_newline_not_found(found.location)
      end

      @lexer.transit(Initial.new(@lexer)) if found.type == :NEW_LINE
      found
    end
  end

  # Lexer state for the remainder of an #ifdef line (entered from Initial
  # after the :IFDEF token).  Only identifiers and the newline are tokenized.
  class InIfdefDirective < LexerState
    # Returns the next :IDENTIFIER token of the directive, or the :NEW_LINE
    # token that ends it.  Comments and escaped newlines are skipped and
    # everything else is dropped one character at a time.  The lexer returns
    # to the Initial state once the :NEW_LINE token is produced.
    def next_token
      content = @lexer.content
      token = nil

      until token || content.empty?
        next if discard_heading_comments
        next if scan_escaped_newline(content)

        token = tokenize_identifier(content) || tokenize_new_line(content)
        content.eat! unless token
      end

      unless token
        # Content ran out before a newline: synthesize one and report it.
        token = Token.new(:NEW_LINE, "\n", content.location)
        @lexer.notify_eof_newline_not_found(token.location)
      end

      @lexer.transit(Initial.new(@lexer)) if token.type == :NEW_LINE
      token
    end
  end

  # Lexer state for the remainder of an #ifndef line (entered from Initial
  # after the :IFNDEF token).  Tokenizes exactly like InIfdefDirective.
  class InIfndefDirective < InIfdefDirective
  end

  # Lexer state for the remainder of an #elif line (entered from Initial
  # after the :ELIF token).  Tokenizes exactly like InIfDirective.
  class InElifDirective < InIfDirective
  end

  # Lexer state for the remainder of an #else line (entered from Initial
  # after the :ELSE token).  Nothing but the newline is tokenized.
  class InElseDirective < LexerState
    # Returns the :NEW_LINE token that ends the directive.  Comments and
    # escaped newlines are skipped and any other text on the line is dropped.
    # The lexer returns to the Initial state afterwards.
    def next_token
      content = @lexer.content
      new_line = nil

      until content.empty?
        next if discard_heading_comments || scan_escaped_newline(content)

        new_line = tokenize_new_line(content)
        break if new_line

        content.eat!
      end

      if new_line.nil?
        # Content ran out before a newline: synthesize one and report it.
        new_line = Token.new(:NEW_LINE, "\n", content.location)
        @lexer.notify_eof_newline_not_found(new_line.location)
      end

      @lexer.transit(Initial.new(@lexer)) if new_line.type == :NEW_LINE
      new_line
    end
  end

  # Lexer state for the remainder of an #endif line (entered from Initial
  # after the :ENDIF token).  Tokenizes exactly like InElseDirective.
  class InEndifDirective < InElseDirective
  end

  # Lexer state for the remainder of an #include line (entered from Initial
  # after the :INCLUDE token).
  class InIncludeDirective < LexerState
    # Returns the next token of the directive: a header name when one can be
    # scanned, otherwise a pp-token, otherwise the :NEW_LINE token that ends
    # the line.  Comments and escaped newlines are skipped, and characters
    # that start no token are dropped.  The lexer returns to the Initial
    # state once the :NEW_LINE token is produced.
    def next_token
      content = @lexer.content
      result = nil

      until content.empty?
        next if discard_heading_comments
        next if scan_escaped_newline(content)

        result = tokenize_header_name(content)
        result ||= tokenize_pp_token(content)
        result ||= tokenize_new_line(content)
        break if result

        content.eat!
      end

      unless result
        # Content ran out before a newline: synthesize one and report it.
        result = Token.new(:NEW_LINE, "\n", content.location)
        @lexer.notify_eof_newline_not_found(result.location)
      end

      @lexer.transit(Initial.new(@lexer)) if result.type == :NEW_LINE
      result
    end
  end

  # Lexer state for the remainder of an #include_next line (entered from
  # Initial after the :INCLUDE_NEXT token).  Tokenizes exactly like
  # InIncludeDirective.
  class InIncludeNextDirective < InIncludeDirective
  end

  # Lexer state for the remainder of a #define line (entered from Initial
  # after the :DEFINE token).
  #
  # The whole directive is tokenized on the first request and the tokens are
  # then handed out one by one from an internal queue.
  class InDefineDirective < LexerState
    def initialize(lexer)
      super
      @tokens = []
    end

    # Returns the next queued token, filling the queue first when it is
    # empty.  The lexer returns to the Initial state when the last queued
    # token is handed out.
    def next_token
      if @tokens.empty?
        content = @lexer.content
        tokenize_macro_name(content)
        tokenize_pp_tokens(content)
      end

      @tokens.shift.tap do
        @lexer.transit(Initial.new(@lexer)) if @tokens.empty?
      end
    end

    private
    # Queues the macro name and, when a "(" follows it immediately, the
    # parenthesized parameter list as :IDENTIFIER and punctuator tokens.
    def tokenize_macro_name(content)
      # Macro name: drop everything in front of the first identifier.
      until content.empty?
        next if discard_heading_comments

        name = tokenize_identifier(content)
        if name
          @tokens.push(name)
          break
        end

        content.eat!
      end

      return unless content.check(/\(/)

      # Parameter list: runs until the matching ")" or the end of the line.
      depth = 0
      until content.empty?
        next if discard_heading_comments || scan_escaped_newline(content)

        if (param = tokenize_identifier(content))
          @tokens.push(param)
        elsif (punct = tokenize_punctuator(content))
          @tokens.push(punct)
          case punct.type
          when "("
            depth += 1
          when ")"
            depth -= 1
            break if depth == 0
          end
        elsif (eol = tokenize_new_line(content))
          @tokens.push(eol)
          break
        else
          content.eat!
        end
      end
    end

    # Queues the pp-tokens of the replacement list up to and including the
    # :NEW_LINE token.  A character that starts no token is dropped, and
    # reported as unlexable unless it is a single whitespace character.
    # A :NEW_LINE token is synthesized (and reported) when the queue does not
    # end with one.
    def tokenize_pp_tokens(content)
      until content.empty?
        next if discard_heading_comments || scan_escaped_newline(content)

        pp_token = tokenize_pp_token(content) || tokenize_new_line(content)

        if pp_token
          @tokens.push(pp_token)
          break if pp_token.type == :NEW_LINE
          next
        end

        location = content.location
        eaten = content.eat!
        if eaten && eaten !~ /\A\s\z/
          @lexer.notify_unlexable_char_found(eaten, location)
        end
      end

      tail = @tokens.last
      return if tail && tail.type == :NEW_LINE

      eol = Token.new(:NEW_LINE, "\n", content.location)
      @lexer.notify_eof_newline_not_found(eol.location)
      @tokens.push(eol)
    end
  end

  # Lexer state for the remainder of an #undef line (entered from Initial
  # after the :UNDEF token).  Tokenizes exactly like InDefineDirective.
  class InUndefDirective < InDefineDirective
  end

  # Lexer state for the remainder of a #line line (entered from Initial
  # after the :LINE token).  Tokenizes exactly like InIfDirective.
  class InLineDirective < InIfDirective
  end

  # Lexer state for the remainder of an #error line (entered from Initial
  # after the :ERROR token).  Tokenizes exactly like InLineDirective.
  class InErrorDirective < InLineDirective
  end

  # Lexer state for the remainder of a #pragma line (entered from Initial
  # after the :PRAGMA token).  Tokenizes exactly like InLineDirective.
  class InPragmaDirective < InLineDirective
  end

  # Lexer state for the remainder of an #asm line (entered from Initial
  # after the :ASM token).  Tokenizes exactly like InElseDirective.
  class InAsmDirective < InElseDirective
  end

  # Lexer state for the remainder of an #endasm line (entered from Initial
  # after the :ENDASM token).  Tokenizes exactly like InElseDirective.
  class InEndasmDirective < InElseDirective
  end

  # String lexer which splits a string into :PP_TOKEN tokens.
  #
  # The kind of the most recently produced token is remembered on the
  # context as +last_symbol+.
  class StringToPPTokensLexer < StringLexer
    private
    # Builds the lexer context for +str+ and gives that one context object a
    # +last_symbol+ accessor.
    def create_context(str)
      LexerContext.new(create_content(str)).tap do |context|
        context.singleton_class.class_eval { attr_accessor :last_symbol }
      end
    end

    def create_content(str)
      StringContent.new(str)
    end

    # Tokenizes the context's content into a TokenArray.  Scanning stops
    # after the first newline, which is appended as a :NEW_LINE token.
    # Characters that start no token are dropped.
    def tokenize(context)
      tokens = TokenArray.new
      until context.content.empty?
        next if tokenize_pp_token(context, tokens)

        location = context.location
        new_line = context.content.scan(/\n/)
        if new_line
          tokens.push(Token.new(:NEW_LINE, new_line, location))
          break
        end

        context.content.eat!
      end
      tokens
    end

    # Scans one pp-token and appends it to +token_array+.  The token kinds
    # are tried in this order: keyword, constant, string literal, null
    # constant, identifier, punctuator.
    # Returns true when a token was appended, false otherwise.
    def tokenize_pp_token(context, token_array)
      pp_token = tokenize_keyword(context)        ||
                 tokenize_constant(context)       ||
                 tokenize_string_literal(context) ||
                 tokenize_null_constant(context)  ||
                 tokenize_identifier(context)     ||
                 tokenize_punctuator(context)

      return false unless pp_token

      token_array.push(pp_token)
      true
    end

    def tokenize_keyword(context)
      location = context.location

      keyword = Language::C.scan_keyword(context.content)
      keyword ||= Language::Cpp.scan_keyword(context.content)
      return nil unless keyword

      context.last_symbol = :KEYWORD
      Token.new(:PP_TOKEN, keyword, location, Language::C::KEYWORDS[keyword])
    end

    def tokenize_constant(context)
      location = context.location

      # NOTE: For extended bit-access operators.
      return nil if context.last_symbol == :IDENTIFIER

      constant = Language::C.scan_char_constant(context.content)
      constant ||= Language::C.scan_floating_constant(context.content)
      constant ||= Language::C.scan_integer_constant(context.content)
      return nil unless constant

      context.last_symbol = :CONSTANT
      Token.new(:PP_TOKEN, constant, location, :CONSTANT)
    end

    def tokenize_string_literal(context)
      location = context.location

      literal = Language::C.scan_string_literal(context.content)
      return nil unless literal

      context.last_symbol = :STRING_LITERAL
      Token.new(:PP_TOKEN, literal, location, :STRING_LITERAL)
    end

    def tokenize_null_constant(context)
      location = context.location

      null_constant = Language::C.scan_null_constant(context.content)
      return nil unless null_constant

      context.last_symbol = :NULL
      Token.new(:PP_TOKEN, null_constant, location, :NULL)
    end

    def tokenize_identifier(context)
      location = context.location

      name = Language::C.scan_identifier(context.content)
      return nil unless name

      context.last_symbol = :IDENTIFIER
      Token.new(:PP_TOKEN, name, location, :IDENTIFIER)
    end

    def tokenize_punctuator(context)
      location = context.location

      punct = Language::Cpp.scan_punctuator(context.content)
      return nil unless punct

      context.last_symbol = :PUNCTUATOR
      Token.new(:PP_TOKEN, punct, location, punct)
    end
  end

  # Lexer which splits the text of a text line into pp-tokens, with the
  # content positioned at the text line's own file, line and column.
  class TextLineToPPTokensLexer < StringToPPTokensLexer
    # text_line:: Object whose +token.value+ is the text to tokenize and
    #             whose +location+ supplies fpath, line_no and column_no.
    def initialize(text_line)
      super(text_line.token.value)
      @text_line = text_line
    end

    private
    # Builds the content for +str+ starting at the text line's location.
    def create_content(str)
      origin = @text_line.location
      StringContent.new(str, origin.fpath, origin.line_no, origin.column_no)
    end
  end

end
end
