From 4ed628aca1dc9298e231f6d02db9e81989316eb7 Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Wed, 26 Feb 2025 17:29:30 -0600 Subject: [PATCH 01/12] Initial grid table filter --- lib/banzai/filter/grid_table_filter.rb | 531 ++++++++++++++++++ .../pipeline/plain_markdown_pipeline.rb | 1 + 2 files changed, 532 insertions(+) create mode 100644 lib/banzai/filter/grid_table_filter.rb diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb new file mode 100644 index 00000000000000..1998287fd3efa4 --- /dev/null +++ b/lib/banzai/filter/grid_table_filter.rb @@ -0,0 +1,531 @@ +# frozen_string_literal: true + +# +# GridTableFilter.rb +# +# (c) 2025 by Miguel Angel Reina Ortega & Andreas Kraft +# License: BSD 3-Clause License. See the LICENSE file for further details. +# + +# TODO: This is now a legacy filter, and is only used with the Ruby parser. +# The current markdown parser now properly handles grid table blocks. +# issue: https://gitlab.com/gitlab-org/gitlab/-/issues/460864 +# GridTableFilter.rb +# +# Converts Pandoc-style grid tables to HTML tables with rowspan and colspan support +# + +module Banzai + module Filter + class GridTableFilter < HTML::Pipeline::TextFilter + MARKDOWN_GRID_TABLE_BLOCK_REGEX = %r{ + (? + # Grid table blocks: + # +---+---+---+---+ + # Anything, starting with | blocks which are ignored by this filter + # +---+---+---+---+ + + ^\s*\+-.*\+\s$ # First separator line + (?:.*\n)*? # Any number of rows (non-greedy) + \s*\+-.*\+\s$ # Last separator line + ) + }mx + + require 'logger' + + # Add these regex constants at the top of the file, after the require statement + GRID_TABLE_SEPARATOR = /\s*\+([-:=]+\+)+\s*$/ + GRID_TABLE_HEADER_SEPARATOR = /.*\+([=:]+\+)+.*$/ + GRID_TABLE_BODY_SEPARATOR = /.*\+([:-]+\+)+.*$/ + GRID_TABLE_BODY_SEPARATOR_LINE = /[-:]+$/ + + class Cell + attr_accessor :content, :rowspan, :colspan, :colspan_adjusted, :alignment, :position, :list_flag + + def initialize + @content = nil + @rowspan = 0 + @colspan = 0 + @colspan_adjusted = false + @alignment = 'align="center"' + @position = nil + @list_flag = false + end + + def set_alignment(default_alignments, header_delimiter_positions) + header_delimiter_index = 0 + + while header_delimiter_index < default_alignments.length && + @position > header_delimiter_positions[header_delimiter_index] + header_delimiter_index += 1 + end + + raise "Invalid table formatting" unless header_delimiter_index < default_alignments.length + + if @position < header_delimiter_positions[header_delimiter_index] + @alignment = default_alignments[header_delimiter_index] + elsif @position == header_delimiter_positions[header_delimiter_index] + @alignment = default_alignments[header_delimiter_index] + header_delimiter_index + 1 + end + end + end + + class Row + attr_accessor :cells + + def initialize(length = 1) + @cells = Array.new(length) { Cell.new } + end + + def [](index) + @cells[index] + end + + def []=(index, value) + @cells[index] = value + end + end + + class RowTracker + attr_accessor :row_tracker + + def initialize(items) + @row_tracker = Array.new(items, 0) + end + + def [](index) + @row_tracker[index] + end + + def []=(index, value) + @row_tracker[index] = value + end + end + + # Helper method to detect separator lines + def separator?(line) + GRID_TABLE_SEPARATOR.match?(line) + end + + # Helper method to handle content in cells + def handling_content(cell, content) + if cell.content.nil? + cell.rowspan += 1 + cell.colspan += 1 + if content.strip.start_with?("- ") # List + cell.list_flag = true + cell.content = "#{content.strip}\n" + elsif cell.list_flag && !content.strip.empty? + cell.content += "#{content.strip}\n" + elsif content.strip == "" + cell.list_flag = false + cell.content = "\n" + else + cell.content = content.strip.gsub(/\\\s*$/, "\n") + end + elsif content.strip.start_with?("- ") + cell.content += "\n" unless cell.list_flag + + cell.list_flag = true + cell.content += "#{content.strip}\n" + elsif cell.list_flag && !content.strip.empty? + cell.content = cell.content.strip.chomp("\n") + cell.content += " #{content.strip}\n" + elsif content.strip.empty? + cell.list_flag = false + cell.content += cell.content.end_with?("\n") ? "" : "\n" + else + content = content.strip.gsub(/\\\s*$/, "\n") + cell.content += " #{content}" + end + + cell + end + + # Helper method to adjust colspan + def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions) + (column_index...number_of_parts).each do |j| + delimiter_start = nil + col_i = column_index + + until delimiter_start + delimiter_start = col_i > 0 ? row[col_i - 1].position : 0 + col_i -= 1 + end + + delimiters = ['|', '+'] + positions = delimiters.filter_map do |delimiter| + pos = line[delimiter_start + 1..]&.index(delimiter) + pos ? pos + delimiter_start + 1 : nil + end + + position = positions.min + + if position && position > delimiter_positions[j] + row[column_index].colspan += 1 + + if position == delimiter_positions[-1] + colspan_allocated = row[column_index].colspan + row[column_index].colspan += number_of_columns - colspan_allocated - column_index + end + elsif position && position < delimiter_positions[j] + raise "Wrong cell formatting" + else + break + end + end + + row[column_index] + end + + # rubocop:disable Metrics/AbcSize -- PoC + # rubocop:disable Metrics/CyclomaticComplexity -- PoC + # rubocop:disable Metrics/PerceivedComplexity -- PoC + def parse_pandoc_table_with_spans(pandoc_table) + # Split the input into lines + lines = pandoc_table.strip.split("\n").map(&:strip) + + separator_indices = lines.each_index.select { |i| separator?(lines[i]) } + + raise "No valid separators found in the provided Pandoc table." if separator_indices.empty? + + # Calculate max number of columns and delimiter positions + delimiter_positions = [] + number_of_columns = separator_indices.map { |i| lines[i].count("+") - 1 }.max + separator_index_max_columns = separator_indices.find { |i| lines[i].count("+") - 1 == number_of_columns } + + number_of_columns.times do |j| + start_pos = j == 0 ? 0 : delimiter_positions[j - 1] + pos = lines[separator_index_max_columns][start_pos + 1..]&.index("+") + delimiter_positions << (pos ? pos + start_pos + 1 : -1) + end + + # Process header + has_header = false + header_delimiter_positions = [] + default_alignments = [] + header_rows = [] + header_separator_index = nil + + separator_indices.each do |index| + next unless GRID_TABLE_HEADER_SEPARATOR.match?(lines[index]) + + has_header = true + header_separator_index = index + parts = lines[index].strip.delete_prefix("+").split("+") + + parts.each_with_index do |part, part_index| + default_alignments << if part.start_with?(":") && !part.end_with?(":") + 'align="left"' + elsif !part.start_with?(":") && part.end_with?(":") + 'align="right"' + else + 'align="center"' + end + + start_pos = part_index == 0 ? 0 : header_delimiter_positions[part_index - 1] + pos = lines[index][start_pos + 1..]&.index("+") + header_delimiter_positions << (pos ? pos + start_pos + 1 : -1) + end + break + end + + # Process table body + data_rows = [] + + (separator_indices.length - 1).times do |row| + rows = [] + rows_tracker = nil + in_data_row = false + start = separator_indices[row] + end_idx = separator_indices[row + 1] + row_lines = lines[start...end_idx] + + next if row_lines.empty? + + row_lines.each do |line| + if separator?(line) && !in_data_row + in_data_row = true + parts = line.strip.delete_prefix("+").split("+") + delimiter_index = 0 + rows << Row.new(number_of_columns) + rows_tracker = RowTracker.new(number_of_columns) + + i = 0 + parts.each_with_index do |_, j| + next unless i < number_of_columns + + delimiter_index += parts[j].length + 1 + rows[-1][i].position = delimiter_index + rows[-1][i].set_alignment(default_alignments, header_delimiter_positions) + + i += 1 while delimiter_index > delimiter_positions[i] + i += 1 + end + elsif in_data_row + if GRID_TABLE_BODY_SEPARATOR.match?(line) + cells_content = line.strip + .delete_prefix("|") + .delete_prefix("+") + .delete_suffix("|") + .delete_suffix("+") + .split(/[\|\+]/) + + rows << Row.new(number_of_columns) + aux_delimiter_index = 0 + auxiliar_cell_index = 0 + + cells_content.each_with_index do |_, i| + next unless auxiliar_cell_index < number_of_columns + + aux_delimiter_index += cells_content[i].length + 1 + rows[-1][auxiliar_cell_index].position = aux_delimiter_index + rows[-1][auxiliar_cell_index].set_alignment(default_alignments, header_delimiter_positions) + + auxiliar_cell_index += 1 while aux_delimiter_index > delimiter_positions[auxiliar_cell_index] + + auxiliar_cell_index += 1 + end + + raise "More cells than columns found" unless cells_content.length <= number_of_columns + + column_index = 0 + + cells_content.each_with_index do |content, _i| + if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content) + rows_tracker[column_index] += 1 + rows[rows_tracker[column_index]][column_index].list_flag = false + + column_forward = 0 + (column_index...delimiter_positions.length).each do |del_index| + if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index] + column_forward += 1 + rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1 + end + end + + column_index += column_forward + else + rows[rows_tracker[column_index]][column_index] = + handling_content(rows[rows_tracker[column_index]][column_index], content) + rows[rows_tracker[column_index]][column_index].rowspan += 1 + + unless rows[rows_tracker[column_index]][column_index].colspan_adjusted + rows[rows_tracker[column_index]][column_index].colspan_adjusted = true + rows[rows_tracker[column_index]][column_index] = + adjust_colspan(rows[rows_tracker[column_index]], + column_index, + number_of_columns, + line, + number_of_columns, + delimiter_positions) + end + + if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index] + colspan = rows[rows_tracker[column_index]][column_index].colspan + column_index += (colspan == 0 ? 1 : colspan) # rubocop:disable Metrics/BlockNesting -- PoC + end + end + end + + else + cells_content = line.strip.delete_prefix("|").split(/\s*\|\s*/) + column_index = 0 + + if cells_content.length < number_of_columns + cells_content.each_with_index do |content, _i| + rows[rows_tracker[column_index]][column_index] = + handling_content(rows[rows_tracker[column_index]][column_index], content) + + unless rows[rows_tracker[column_index]][column_index].colspan_adjusted + rows[rows_tracker[column_index]][column_index].colspan_adjusted = true + rows[rows_tracker[column_index]][column_index] = + adjust_colspan(rows[rows_tracker[column_index]], + column_index, + number_of_columns, + line, + number_of_columns, + delimiter_positions) + end + + if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index] + column_index += rows[rows_tracker[column_index]][column_index].colspan + end + end + elsif cells_content.length == number_of_columns + cells_content.each_with_index do |content, i| + rows[rows_tracker[i]][i] = handling_content(rows[rows_tracker[i]][i], content) + end + else + raise "More cells than columns found" + end + end + else + raise "No separator line found for row starting" + end + end + + if has_header && start >= header_separator_index + rows.each { |body_row| data_rows << body_row.cells } + elsif has_header && start < header_separator_index + rows.each { |header_row| header_rows << header_row.cells } + end + end + + raise "No valid rows found in the provided Pandoc table." if data_rows.empty? && header_rows.empty? + + # Format text (bold and italic) + [header_rows, data_rows].each do |rows| + rows.each do |row| + row.each do |cell| + next if cell.content.nil? + + delimters = ['**', '__'] + delimters.each do |bold_chars| + while cell.content.include?(bold_chars) + cell.content = cell.content.sub(bold_chars, "") + .sub(bold_chars, "") + end + end + + while cell.content.include?("_") && cell.content.exclude?("\\_") + cell.content = cell.content.rstrip.sub("_", "").sub("_", "") + end + + cell.content = cell.content.rstrip.sub("\\_", "_") while cell.content.include?("\\_") + + # Convert newlines to HTML breaks + cell.content = cell.content&.gsub("\n", "
") + end + end + + # Validate grid correctness + forward_rowspan = [] + + rows.each_with_index do |row, row_index| + forward_rowspan = Array.new(row.length, 0) if forward_rowspan.empty? + sum = 0 + + row.each_with_index do |cell, cell_index| + sum += cell.colspan + + if row_index > 0 && cell.colspan == 0 + sum += 1 if forward_rowspan[cell_index] > 0 + + forward_rowspan[cell_index] -= 1 + end + + forward_rowspan[cell_index] = cell.rowspan - 1 if forward_rowspan[cell_index] == 0 && cell.rowspan > 1 + end + + raise "Grid table not converted properly" unless sum == number_of_columns + end + end + + [header_rows, data_rows] + end + + def generate_html_table_with_spans(pandoc_table) + grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) + rescue StandardError => e + logger = Logger.new($stdout) + logger.error("Grid table could not be generated: #{e.message}") + + "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS" + else + html = "\n" + has_header = false + + grid_header.each do |row| + row.each do |cell| + if cell.rowspan != 0 && cell.colspan != 0 + has_header = true + break + end + end + end + + if has_header + html += " \n" + grid_header.each do |row| + html += " \n" + row.each do |cell| + next if cell.rowspan == 0 || cell.colspan == 0 + + # Prepare content, in case there's a list + matches = cell.content&.scan(%r{\s*([-*+]|\d+\.)\s+([^<]+?)(?=
|$)}) + if matches + list = "" + cell.content = cell.content.gsub(%r{(\s*([-*+]|\d+\.)\s+[^<]+?
)+}, list) + # Enforce left alignment if cell contains a list + cell.alignment = 'align="left"' + end + + rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" + colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" + html += %( #{cell.content}\n) + end + html += " \n" + end + html += " \n" + end + + html += " \n" + grid_body.each do |row| + html += " \n" + row.each do |cell| + next if cell.rowspan == 0 || cell.colspan == 0 + + matches = cell.content&.scan(%r{\s*([-*+]|\d+\.)\s+([^<]+?)(?=
|$)}) + if matches + list = "" + cell.content = cell.content.gsub(%r{(\s*([-*+]|\d+\.)\s+[^<]+?
)+}, list) + # Enforce left alignment if cell contains a list + cell.alignment = 'align="left"' + end + + rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" + colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" + html += %( #{cell.content}\n) + end + html += " \n" + end + + html += " \n" + html += "
" + html + end + # rubocop:enable Metrics/PerceivedComplexity + # rubocop:enable Metrics/CyclomaticComplexity + # rubocop:enable Metrics/AbcSize + + def call + return @text if MarkdownFilter.glfm_markdown?(context) + + regex = Gitlab::UntrustedRegexp.new(MARKDOWN_GRID_TABLE_BLOCK_REGEX, multiline: true) + return @text unless regex.match?(@text) + + regex.replace_gsub(@text) do |match| + # Extract the grid table content from the match + grid_table = match[:code] + if grid_table + # Convert grid table to HTML table + generate_html_table_with_spans(grid_table) + else + # Return original text if no grid table found + match.to_s + end + end + end + end + end +end diff --git a/lib/banzai/pipeline/plain_markdown_pipeline.rb b/lib/banzai/pipeline/plain_markdown_pipeline.rb index 19aadb0cc049fe..f64437749c2ca6 100644 --- a/lib/banzai/pipeline/plain_markdown_pipeline.rb +++ b/lib/banzai/pipeline/plain_markdown_pipeline.rb @@ -6,6 +6,7 @@ class PlainMarkdownPipeline < BasePipeline def self.filters FilterArray[ Filter::IncludeFilter, + Filter::GridTableFilter, Filter::MarkdownFilter, Filter::ConvertTextToDocFilter, ] -- GitLab From 3c44dbba54dd9c395cd05aba4908faa2591b5aa0 Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Wed, 26 Feb 2025 17:30:10 -0600 Subject: [PATCH 02/12] Fix markdown guard --- lib/banzai/filter/grid_table_filter.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index 1998287fd3efa4..6e9a3bd18a1cdb 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -509,7 +509,7 @@ def generate_html_table_with_spans(pandoc_table) # rubocop:enable Metrics/AbcSize def call - return @text if MarkdownFilter.glfm_markdown?(context) + return @text unless MarkdownFilter.glfm_markdown?(context) regex = Gitlab::UntrustedRegexp.new(MARKDOWN_GRID_TABLE_BLOCK_REGEX, multiline: true) return @text unless regex.match?(@text) -- GitLab From 8e15010c10783e3a679d28de1c7b47077c92bf8f Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Wed, 26 Feb 2025 17:32:04 -0600 Subject: [PATCH 03/12] Fix regex --- lib/banzai/filter/grid_table_filter.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index 6e9a3bd18a1cdb..28770e1b83307c 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -25,9 +25,9 @@ class GridTableFilter < HTML::Pipeline::TextFilter # Anything, starting with | blocks which are ignored by this filter # +---+---+---+---+ - ^\s*\+-.*\+\s$ # First separator line + ^\s*\+(-+\+)+$ # First separator line (?:.*\n)*? # Any number of rows (non-greedy) - \s*\+-.*\+\s$ # Last separator line + \s*\+(-+\+)+$ # Last separator line ) }mx -- GitLab From 69e2a124e2fbaea2e04cbc3af51d91516e9e1ced Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Wed, 26 Feb 2025 17:34:05 -0600 Subject: [PATCH 04/12] =?UTF-8?q?Don=E2=80=99t=20use=20Gitlab::UntrustedRe?= =?UTF-8?q?gexp=20for=20now?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/banzai/filter/grid_table_filter.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index 28770e1b83307c..0f00b0ccc457df 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -511,10 +511,12 @@ def generate_html_table_with_spans(pandoc_table) def call return @text unless MarkdownFilter.glfm_markdown?(context) - regex = Gitlab::UntrustedRegexp.new(MARKDOWN_GRID_TABLE_BLOCK_REGEX, multiline: true) + regex = MARKDOWN_GRID_TABLE_BLOCK_REGEX return @text unless regex.match?(@text) - regex.replace_gsub(@text) do |match| + @text.gsub(regex) do + match = Regexp.last_match + # Extract the grid table content from the match grid_table = match[:code] if grid_table -- GitLab From fccd3a46dbc020cab30dcc7343319a9f39cb6690 Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Fri, 7 Mar 2025 10:06:21 -0600 Subject: [PATCH 05/12] Latest changes made by @reinaortega --- lib/banzai/filter/grid_table_filter.rb | 214 +++++++++++++------------ 1 file changed, 112 insertions(+), 102 deletions(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index 0f00b0ccc457df..a926e68f9e3e2f 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -25,51 +25,50 @@ class GridTableFilter < HTML::Pipeline::TextFilter # Anything, starting with | blocks which are ignored by this filter # +---+---+---+---+ - ^\s*\+(-+\+)+$ # First separator line - (?:.*\n)*? # Any number of rows (non-greedy) - \s*\+(-+\+)+$ # Last separator line + ^\s*\+(-+\+)+$\n # First separator line + (?:^\s*[|+][^\n]*$\n)* + ^\s*\+(-+\+)+$ # Last separator line + ) }mx require 'logger' # Add these regex constants at the top of the file, after the require statement - GRID_TABLE_SEPARATOR = /\s*\+([-:=]+\+)+\s*$/ - GRID_TABLE_HEADER_SEPARATOR = /.*\+([=:]+\+)+.*$/ - GRID_TABLE_BODY_SEPARATOR = /.*\+([:-]+\+)+.*$/ - GRID_TABLE_BODY_SEPARATOR_LINE = /[-:]+$/ + GRID_TABLE_SEPARATOR = /^\s*\+([-:=]+\+)+\s*$/ + GRID_TABLE_HEADER_SEPARATOR = /^\s*\+([=:]+\+)+\s*$/ + GRID_TABLE_BODY_SEPARATOR = /[^\n]*\+([:-]+\+)+[^\n]*$/ + GRID_TABLE_BODY_SEPARATOR_LINE = /^[-:]+$/ class Cell - attr_accessor :content, :rowspan, :colspan, :colspan_adjusted, :alignment, :position, :list_flag - - def initialize - @content = nil - @rowspan = 0 - @colspan = 0 - @colspan_adjusted = false - @alignment = 'align="center"' - @position = nil - @list_flag = false - end - - def set_alignment(default_alignments, header_delimiter_positions) - header_delimiter_index = 0 - - while header_delimiter_index < default_alignments.length && - @position > header_delimiter_positions[header_delimiter_index] - header_delimiter_index += 1 + attr_accessor :content, :rowspan, :colspan, :colspan_adjusted, :alignment, :position, :list_flag + + def initialize + @content = nil + @rowspan = 0 + @colspan = 0 + @colspan_adjusted = false + @alignment = 'align="center"' + @position = nil + @list_flag = false end - raise "Invalid table formatting" unless header_delimiter_index < default_alignments.length + def set_alignment(default_alignments, header_delimiter_positions) + header_delimiter_index = 0 + while header_delimiter_index < default_alignments.length && + @position > header_delimiter_positions[header_delimiter_index] + header_delimiter_index += 1 + end - if @position < header_delimiter_positions[header_delimiter_index] - @alignment = default_alignments[header_delimiter_index] - elsif @position == header_delimiter_positions[header_delimiter_index] - @alignment = default_alignments[header_delimiter_index] - header_delimiter_index + 1 + raise "Invalid table formatting" unless header_delimiter_index < default_alignments.length + if @position < header_delimiter_positions[header_delimiter_index] + @alignment = default_alignments[header_delimiter_index] + elsif @position == header_delimiter_positions[header_delimiter_index] + @alignment = default_alignments[header_delimiter_index] + header_delimiter_index += 1 + end end - end - end + end # end of class Cell class Row attr_accessor :cells @@ -85,7 +84,7 @@ def [](index) def []=(index, value) @cells[index] = value end - end + end # end of class Row class RowTracker attr_accessor :row_tracker @@ -101,13 +100,16 @@ def [](index) def []=(index, value) @row_tracker[index] = value end - end + + def maxValue + @row_tracker.max + end + end # end of class RowTracker # Helper method to detect separator lines def separator?(line) GRID_TABLE_SEPARATOR.match?(line) end - # Helper method to handle content in cells def handling_content(cell, content) if cell.content.nil? @@ -183,9 +185,11 @@ def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, # rubocop:disable Metrics/CyclomaticComplexity -- PoC # rubocop:disable Metrics/PerceivedComplexity -- PoC def parse_pandoc_table_with_spans(pandoc_table) + # Split the input into lines lines = pandoc_table.strip.split("\n").map(&:strip) + # Retrieve separator indices separator_indices = lines.each_index.select { |i| separator?(lines[i]) } raise "No valid separators found in the provided Pandoc table." if separator_indices.empty? @@ -193,8 +197,9 @@ def parse_pandoc_table_with_spans(pandoc_table) # Calculate max number of columns and delimiter positions delimiter_positions = [] number_of_columns = separator_indices.map { |i| lines[i].count("+") - 1 }.max - separator_index_max_columns = separator_indices.find { |i| lines[i].count("+") - 1 == number_of_columns } + # Determine delimiter positions + separator_index_max_columns = separator_indices.find { |i| lines[i].count("+") - 1 == number_of_columns } number_of_columns.times do |j| start_pos = j == 0 ? 0 : delimiter_positions[j - 1] pos = lines[separator_index_max_columns][start_pos + 1..]&.index("+") @@ -208,6 +213,7 @@ def parse_pandoc_table_with_spans(pandoc_table) header_rows = [] header_separator_index = nil + # Determine header delimiter positions separator_indices.each do |index| next unless GRID_TABLE_HEADER_SEPARATOR.match?(lines[index]) @@ -231,7 +237,7 @@ def parse_pandoc_table_with_spans(pandoc_table) break end - # Process table body + # Process table body (including rows belonging to header as they are processed in the same way) data_rows = [] (separator_indices.length - 1).times do |row| @@ -245,6 +251,7 @@ def parse_pandoc_table_with_spans(pandoc_table) next if row_lines.empty? row_lines.each do |line| + # First line (normally a separator) of each block if separator?(line) && !in_data_row in_data_row = true parts = line.strip.delete_prefix("+").split("+") @@ -263,8 +270,10 @@ def parse_pandoc_table_with_spans(pandoc_table) i += 1 while delimiter_index > delimiter_positions[i] i += 1 end + # Lines in a block elsif in_data_row - if GRID_TABLE_BODY_SEPARATOR.match?(line) + # Regular data row or partial separator + if GRID_TABLE_BODY_SEPARATOR.match?(line) # Partial separator cells_content = line.strip .delete_prefix("|") .delete_prefix("+") @@ -291,22 +300,22 @@ def parse_pandoc_table_with_spans(pandoc_table) raise "More cells than columns found" unless cells_content.length <= number_of_columns column_index = 0 - + maxRowTracker = rows_tracker.maxValue cells_content.each_with_index do |content, _i| - if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content) - rows_tracker[column_index] += 1 + if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content) # Separator - split row + rows_tracker[column_index] = maxRowTracker + 1 rows[rows_tracker[column_index]][column_index].list_flag = false column_forward = 0 (column_index...delimiter_positions.length).each do |del_index| if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index] column_forward += 1 - rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1 + #rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1 end end column_index += column_forward - else + else # Regular cell in Partial separator line rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], content) rows[rows_tracker[column_index]][column_index].rowspan += 1 @@ -329,8 +338,8 @@ def parse_pandoc_table_with_spans(pandoc_table) end end - else - cells_content = line.strip.delete_prefix("|").split(/\s*\|\s*/) + else # Data row + cells_content = line.strip.delete_prefix("|").split(/\|/) column_index = 0 if cells_content.length < number_of_columns @@ -424,36 +433,64 @@ def parse_pandoc_table_with_spans(pandoc_table) end [header_rows, data_rows] - end + end # end of parse_pandoc_table_with_spans def generate_html_table_with_spans(pandoc_table) - grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) - rescue StandardError => e - logger = Logger.new($stdout) - logger.error("Grid table could not be generated: #{e.message}") - - "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS" - else - html = "\n" - has_header = false + begin + grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) + rescue StandardError => e + logger = Logger.new($stdout) + logger.error("Grid table could not be generated: #{e.message}") + + "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS" + else + html = "
\n" + has_header = false - grid_header.each do |row| - row.each do |cell| - if cell.rowspan != 0 && cell.colspan != 0 - has_header = true - break + grid_header.each do |row| + row.each do |cell| + if cell.rowspan != 0 && cell.colspan != 0 + has_header = true + break + end end end - end - if has_header - html += " \n" - grid_header.each do |row| + if has_header + html += " \n" + grid_header.each do |row| + html += " \n" + row.each do |cell| + next if cell.rowspan == 0 || cell.colspan == 0 + + # Prepare content, in case there's a list + matches = cell.content&.scan(%r{\s*([-*+]|\d+\.)\s+([^<]+?)(?=
|$)}) + if matches + list = "
    " + matches.each do |match| + list += "
  • #{match[1]}
  • " + end + list += "
" + cell.content = cell.content.gsub(%r{(\s*([-*+]|\d+\.)\s+[^<]+?
)+}, list) + # Enforce left alignment if cell contains a list + cell.alignment = 'align="left"' + end + + rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" + colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" + html += %( #{cell.content}\n) + end + html += " \n" + end + html += " \n" + end + + html += " \n" + grid_body.each do |row| html += " \n" row.each do |cell| next if cell.rowspan == 0 || cell.colspan == 0 - # Prepare content, in case there's a list matches = cell.content&.scan(%r{\s*([-*+]|\d+\.)\s+([^<]+?)(?=
|$)}) if matches list = "
    " @@ -468,42 +505,16 @@ def generate_html_table_with_spans(pandoc_table) rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" - html += %( #{cell.content}\n) + html += %( #{cell.content}\n) end html += "
\n" end - html += " \n" - end - - html += " \n" - grid_body.each do |row| - html += " \n" - row.each do |cell| - next if cell.rowspan == 0 || cell.colspan == 0 - - matches = cell.content&.scan(%r{\s*([-*+]|\d+\.)\s+([^<]+?)(?=
|$)}) - if matches - list = "
    " - matches.each do |match| - list += "
  • #{match[1]}
  • " - end - list += "
" - cell.content = cell.content.gsub(%r{(\s*([-*+]|\d+\.)\s+[^<]+?
)+}, list) - # Enforce left alignment if cell contains a list - cell.alignment = 'align="left"' - end - rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" - colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" - html += %( #{cell.content}\n) - end - html += " \n" + html += " \n" + html += "
" + html end - - html += " \n" - html += "" - html - end + end # end of def generate_html_table_with_spans # rubocop:enable Metrics/PerceivedComplexity # rubocop:enable Metrics/CyclomaticComplexity # rubocop:enable Metrics/AbcSize @@ -516,7 +527,6 @@ def call @text.gsub(regex) do match = Regexp.last_match - # Extract the grid table content from the match grid_table = match[:code] if grid_table @@ -527,7 +537,7 @@ def call match.to_s end end - end - end - end -end + end # end of def call + end # end of class GridTableFilter + end # end of module Filter +end # end of module Banzai -- GitLab From 75d089cc9641e9720a0f2c39359bdb1650203a41 Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Thu, 20 Mar 2025 10:50:12 -0500 Subject: [PATCH 06/12] Update from customer --- lib/banzai/filter/grid_table_filter.rb | 197 ++++++++++++++++--------- 1 file changed, 127 insertions(+), 70 deletions(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index a926e68f9e3e2f..f1fe8c4fd4868b 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -40,33 +40,36 @@ class GridTableFilter < HTML::Pipeline::TextFilter GRID_TABLE_BODY_SEPARATOR = /[^\n]*\+([:-]+\+)+[^\n]*$/ GRID_TABLE_BODY_SEPARATOR_LINE = /^[-:]+$/ + NEXT_ELEMENT_LIST_MARK = "∆" + class Cell - attr_accessor :content, :rowspan, :colspan, :colspan_adjusted, :alignment, :position, :list_flag + attr_accessor :content, :rowspan, :colspan, :colspan_adjusted, :alignment, :position_start, :position, :list_flag def initialize - @content = nil - @rowspan = 0 - @colspan = 0 - @colspan_adjusted = false - @alignment = 'align="center"' - @position = nil - @list_flag = false + @content = nil + @rowspan = 0 + @colspan = 0 + @colspan_adjusted = false + @alignment = 'align="center"' + @position_start = nil + @position = nil + @list_flag = false end - def set_alignment(default_alignments, header_delimiter_positions) + def calculateAndSetAlignment(header_delimiter_positions, default_alignments ) + + raise "Cell position must be set before calculating alignment" if @position.nil? || @position_start.nil? + header_delimiter_index = 0 while header_delimiter_index < default_alignments.length && - @position > header_delimiter_positions[header_delimiter_index] + @position_start > header_delimiter_positions[header_delimiter_index] header_delimiter_index += 1 end raise "Invalid table formatting" unless header_delimiter_index < default_alignments.length - if @position < header_delimiter_positions[header_delimiter_index] - @alignment = default_alignments[header_delimiter_index] - elsif @position == header_delimiter_positions[header_delimiter_index] - @alignment = default_alignments[header_delimiter_index] - header_delimiter_index += 1 - end + + @alignment = default_alignments[header_delimiter_index] + end end # end of class Cell @@ -112,34 +115,40 @@ def separator?(line) end # Helper method to handle content in cells def handling_content(cell, content) + _c = content.strip if cell.content.nil? cell.rowspan += 1 cell.colspan += 1 - if content.strip.start_with?("- ") # List + if _c.start_with?("- ") # List cell.list_flag = true - cell.content = "#{content.strip}\n" + _c = _c.gsub(/\\\s*$/, '\n') + cell.content = "#{_c}#{NEXT_ELEMENT_LIST_MARK}" # Add list element end mark to know when the list element ends elsif cell.list_flag && !content.strip.empty? - cell.content += "#{content.strip}\n" - elsif content.strip == "" - cell.list_flag = false + _c = _c.gsub(/\\\s*$/, '\n') + cell.content = "#{_c}#{NEXT_ELEMENT_LIST_MARK}" #add the list element end mark + elsif _c.empty? cell.content = "\n" else - cell.content = content.strip.gsub(/\\\s*$/, "\n") + cell.content = _c.gsub(/\\\s*$/, "\n") end - elsif content.strip.start_with?("- ") + elsif _c.start_with?("- ") cell.content += "\n" unless cell.list_flag - cell.list_flag = true - cell.content += "#{content.strip}\n" - elsif cell.list_flag && !content.strip.empty? - cell.content = cell.content.strip.chomp("\n") - cell.content += " #{content.strip}\n" - elsif content.strip.empty? - cell.list_flag = false + _c = _c.gsub(/\\\s*$/, '\n') + cell.content += "#{_c}#{NEXT_ELEMENT_LIST_MARK}" + elsif cell.list_flag && !_c.empty? + cell.content = cell.content.strip.chomp("#{NEXT_ELEMENT_LIST_MARK}") + _c = _c.gsub(/\\\s*$/, '\n') + cell.content += " #{_c}#{NEXT_ELEMENT_LIST_MARK}" + elsif _c.empty? + if cell.list_flag + cell.list_flag = false + cell.content += "\n\n" + end cell.content += cell.content.end_with?("\n") ? "" : "\n" else - content = content.strip.gsub(/\\\s*$/, "\n") - cell.content += " #{content}" + _c = _c.gsub(/\\\s*$/, "\n") + cell.content += " #{_c}" end cell @@ -158,9 +167,9 @@ def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiters = ['|', '+'] positions = delimiters.filter_map do |delimiter| - pos = line[delimiter_start + 1..]&.index(delimiter) + pos = line[delimiter_start + 1..-1]&.index(delimiter) pos ? pos + delimiter_start + 1 : nil - end + end.compact position = positions.min @@ -181,15 +190,41 @@ def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, row[column_index] end + def checkDelimiterAlignment(line, delimiterPositions) + return false if line.empty? || delimiterPositions.empty? + + #puts "\nChecking line: #{line}" + #puts "Expected delimiter positions: #{delimiterPositions}" + + # For any row (only +, only |, mix of + and |) + currentPositions = [] + start_pos = 1 + + while start_pos < line.length + pos = line.index(/[|+]/, start_pos) # Find the next occurrence of | or + starting from start_pos + break if pos.nil? # Exit if no more delimiters are found + + currentPositions << pos + start_pos = pos + 1 # Move to the next character after the found delimiter + end + + #puts "Current positions: #{currentPositions}" + + # Check if the last expected delimiter position is found in currentPositions + return currentPositions.include?(delimiterPositions[-1]) && + line.match?(/\A[|+]/) && # Check if the line starts with | or + + currentPositions.all? { |pos| delimiterPositions.include?(pos) } # Ensure all current positions are in delimiterPositions + end + # rubocop:disable Metrics/AbcSize -- PoC # rubocop:disable Metrics/CyclomaticComplexity -- PoC # rubocop:disable Metrics/PerceivedComplexity -- PoC def parse_pandoc_table_with_spans(pandoc_table) # Split the input into lines - lines = pandoc_table.strip.split("\n").map(&:strip) + lines = pandoc_table.rstrip.split("\n").map(&:rstrip) - # Retrieve separator indices + # Retrieve separator indices separator_indices = lines.each_index.select { |i| separator?(lines[i]) } raise "No valid separators found in the provided Pandoc table." if separator_indices.empty? @@ -198,11 +233,11 @@ def parse_pandoc_table_with_spans(pandoc_table) delimiter_positions = [] number_of_columns = separator_indices.map { |i| lines[i].count("+") - 1 }.max - # Determine delimiter positions + # Determine delimiter positions separator_index_max_columns = separator_indices.find { |i| lines[i].count("+") - 1 == number_of_columns } number_of_columns.times do |j| start_pos = j == 0 ? 0 : delimiter_positions[j - 1] - pos = lines[separator_index_max_columns][start_pos + 1..]&.index("+") + pos = lines[separator_index_max_columns][start_pos + 1..-1]&.index("+") delimiter_positions << (pos ? pos + start_pos + 1 : -1) end @@ -231,12 +266,36 @@ def parse_pandoc_table_with_spans(pandoc_table) end start_pos = part_index == 0 ? 0 : header_delimiter_positions[part_index - 1] - pos = lines[index][start_pos + 1..]&.index("+") + pos = lines[index][start_pos + 1..-1]&.index("+") header_delimiter_positions << (pos ? pos + start_pos + 1 : -1) end break end + unless has_header + # Set default alignments from the first separator which takes the role of header + header_separator_index = 0 + parts = lines[0].strip.delete_prefix("+").split("+") + + parts.each_with_index do |part, part_index| + default_alignments << if part.start_with?(":") && !part.end_with?(":") + 'align="left"' + elsif !part.start_with?(":") && part.end_with?(":") + 'align="right"' + else + 'align="center"' + end + + start_pos = part_index == 0 ? 0 : header_delimiter_positions[part_index - 1] + pos = lines[0][start_pos + 1..-1]&.index("+") + header_delimiter_positions << (pos ? pos + start_pos + 1 : -1) + end + end + + #Check end table delimiter alignment (not checked during the lines processing) + raise "Misaligned delimiters in table separators: #{lines[-1]}" unless checkDelimiterAlignment(lines[-1], delimiter_positions) + + # Process table body (including rows belonging to header as they are processed in the same way) data_rows = [] @@ -251,9 +310,13 @@ def parse_pandoc_table_with_spans(pandoc_table) next if row_lines.empty? row_lines.each do |line| + line = line.rstrip # First line (normally a separator) of each block if separator?(line) && !in_data_row in_data_row = true + #Check end table delimiter alignment (not checked during the lines processing) + raise "Misaligned delimiters in separator row: #{line}" unless checkDelimiterAlignment(line, delimiter_positions) + parts = line.strip.delete_prefix("+").split("+") delimiter_index = 0 rows << Row.new(number_of_columns) @@ -264,22 +327,21 @@ def parse_pandoc_table_with_spans(pandoc_table) next unless i < number_of_columns delimiter_index += parts[j].length + 1 + rows[-1][i].position_start = delimiter_index - parts[j].length rows[-1][i].position = delimiter_index - rows[-1][i].set_alignment(default_alignments, header_delimiter_positions) + rows[-1][i].calculateAndSetAlignment(header_delimiter_positions, default_alignments ) i += 1 while delimiter_index > delimiter_positions[i] i += 1 end - # Lines in a block + # Lines in a block elsif in_data_row # Regular data row or partial separator - if GRID_TABLE_BODY_SEPARATOR.match?(line) # Partial separator - cells_content = line.strip - .delete_prefix("|") - .delete_prefix("+") - .delete_suffix("|") - .delete_suffix("+") - .split(/[\|\+]/) + if GRID_TABLE_BODY_SEPARATOR.match?(line) # Partial separator + #Check end table delimiter alignment (not checked during the lines processing) + raise "Misaligned delimiters in partial separator: #{line}" unless checkDelimiterAlignment(line, delimiter_positions) + + cells_content = line.strip.gsub(/^(\+|\|)/, '').split(/[\|\+]/) rows << Row.new(number_of_columns) aux_delimiter_index = 0 @@ -289,8 +351,9 @@ def parse_pandoc_table_with_spans(pandoc_table) next unless auxiliar_cell_index < number_of_columns aux_delimiter_index += cells_content[i].length + 1 + rows[-1][auxiliar_cell_index].position_start = aux_delimiter_index - cells_content[i].length rows[-1][auxiliar_cell_index].position = aux_delimiter_index - rows[-1][auxiliar_cell_index].set_alignment(default_alignments, header_delimiter_positions) + rows[-1][auxiliar_cell_index].calculateAndSetAlignment(header_delimiter_positions, default_alignments ) auxiliar_cell_index += 1 while aux_delimiter_index > delimiter_positions[auxiliar_cell_index] @@ -301,6 +364,7 @@ def parse_pandoc_table_with_spans(pandoc_table) column_index = 0 maxRowTracker = rows_tracker.maxValue + cells_content.each_with_index do |content, _i| if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content) # Separator - split row rows_tracker[column_index] = maxRowTracker + 1 @@ -310,7 +374,6 @@ def parse_pandoc_table_with_spans(pandoc_table) (column_index...delimiter_positions.length).each do |del_index| if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index] column_forward += 1 - #rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1 end end @@ -337,9 +400,11 @@ def parse_pandoc_table_with_spans(pandoc_table) end end end - else # Data row cells_content = line.strip.delete_prefix("|").split(/\|/) + #Check end table delimiter alignment (not checked during the lines processing) + raise "Misaligned delimiters in row: #{line}" unless checkDelimiterAlignment(line, delimiter_positions) + column_index = 0 if cells_content.length < number_of_columns @@ -379,10 +444,12 @@ def parse_pandoc_table_with_spans(pandoc_table) rows.each { |body_row| data_rows << body_row.cells } elsif has_header && start < header_separator_index rows.each { |header_row| header_rows << header_row.cells } + else + rows.each { |body_row| data_rows << body_row.cells } end - end - raise "No valid rows found in the provided Pandoc table." if data_rows.empty? && header_rows.empty? + raise "No valid rows found in the provided Pandoc table." if data_rows.empty? && header_rows.empty? + end # Format text (bold and italic) [header_rows, data_rows].each do |rows| @@ -390,20 +457,11 @@ def parse_pandoc_table_with_spans(pandoc_table) row.each do |cell| next if cell.content.nil? - delimters = ['**', '__'] - delimters.each do |bold_chars| - while cell.content.include?(bold_chars) - cell.content = cell.content.sub(bold_chars, "") - .sub(bold_chars, "") - end - end - - while cell.content.include?("_") && cell.content.exclude?("\\_") - cell.content = cell.content.rstrip.sub("_", "").sub("_", "") - end + cell.content = cell.content.gsub(/^|\s)(?\*\*|__)(?.+?)\g(?!\w)/, "\\k\\k") + cell.content = cell.content.gsub(/(?^|\s)(?\*|_)(?.+?)\g(?!\w)/, "\\k\\k") # Convert newlines to HTML breaks cell.content = cell.content&.gsub("\n", "
") end @@ -464,14 +522,14 @@ def generate_html_table_with_spans(pandoc_table) next if cell.rowspan == 0 || cell.colspan == 0 # Prepare content, in case there's a list - matches = cell.content&.scan(%r{\s*([-*+]|\d+\.)\s+([^<]+?)(?=
|$)}) + matches = cell.content&.scan(/\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/) if matches list = "
    " matches.each do |match| list += "
  • #{match[1]}
  • " end list += "
" - cell.content = cell.content.gsub(%r{(\s*([-*+]|\d+\.)\s+[^<]+?
)+}, list) + cell.content = cell.content.gsub(/(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/, list) # Enforce left alignment if cell contains a list cell.alignment = 'align="left"' end @@ -491,14 +549,13 @@ def generate_html_table_with_spans(pandoc_table) row.each do |cell| next if cell.rowspan == 0 || cell.colspan == 0 - matches = cell.content&.scan(%r{\s*([-*+]|\d+\.)\s+([^<]+?)(?=
|$)}) + matches = cell.content&.scan(/\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/) if matches list = "
    " matches.each do |match| list += "
  • #{match[1]}
  • " end - list += "
" - cell.content = cell.content.gsub(%r{(\s*([-*+]|\d+\.)\s+[^<]+?
)+}, list) + cell.content = cell.content.gsub(/(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/, list) # Enforce left alignment if cell contains a list cell.alignment = 'align="left"' end -- GitLab From 07ba1cffe66e384e8e5ef95925e165a1e4f9c88c Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Tue, 25 Mar 2025 12:23:24 -0500 Subject: [PATCH 07/12] Rubocop auto corrections --- lib/banzai/filter/grid_table_filter.rb | 254 +++++++++++++------------ 1 file changed, 131 insertions(+), 123 deletions(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index f1fe8c4fd4868b..952d6cb13387b6 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -43,35 +43,34 @@ class GridTableFilter < HTML::Pipeline::TextFilter NEXT_ELEMENT_LIST_MARK = "∆" class Cell - attr_accessor :content, :rowspan, :colspan, :colspan_adjusted, :alignment, :position_start, :position, :list_flag - - def initialize - @content = nil - @rowspan = 0 - @colspan = 0 - @colspan_adjusted = false - @alignment = 'align="center"' - @position_start = nil - @position = nil - @list_flag = false - end - - def calculateAndSetAlignment(header_delimiter_positions, default_alignments ) - - raise "Cell position must be set before calculating alignment" if @position.nil? || @position_start.nil? + attr_accessor :content, :rowspan, :colspan, :colspan_adjusted, :alignment, :position_start, :position, + :list_flag + + def initialize + @content = nil + @rowspan = 0 + @colspan = 0 + @colspan_adjusted = false + @alignment = 'align="center"' + @position_start = nil + @position = nil + @list_flag = false + end - header_delimiter_index = 0 - while header_delimiter_index < default_alignments.length && - @position_start > header_delimiter_positions[header_delimiter_index] - header_delimiter_index += 1 - end + def calculateAndSetAlignment(header_delimiter_positions, default_alignments) + raise "Cell position must be set before calculating alignment" if @position.nil? || @position_start.nil? - raise "Invalid table formatting" unless header_delimiter_index < default_alignments.length + header_delimiter_index = 0 + while header_delimiter_index < default_alignments.length && + @position_start > header_delimiter_positions[header_delimiter_index] + header_delimiter_index += 1 + end - @alignment = default_alignments[header_delimiter_index] + raise "Invalid table formatting" unless header_delimiter_index < default_alignments.length - end - end # end of class Cell + @alignment = default_alignments[header_delimiter_index] + end + end class Row attr_accessor :cells @@ -87,7 +86,7 @@ def [](index) def []=(index, value) @cells[index] = value end - end # end of class Row + end class RowTracker attr_accessor :row_tracker @@ -107,12 +106,13 @@ def []=(index, value) def maxValue @row_tracker.max end - end # end of class RowTracker + end # Helper method to detect separator lines def separator?(line) GRID_TABLE_SEPARATOR.match?(line) end + # Helper method to handle content in cells def handling_content(cell, content) _c = content.strip @@ -122,10 +122,10 @@ def handling_content(cell, content) if _c.start_with?("- ") # List cell.list_flag = true _c = _c.gsub(/\\\s*$/, '\n') - cell.content = "#{_c}#{NEXT_ELEMENT_LIST_MARK}" # Add list element end mark to know when the list element ends + cell.content = "#{_c}#{NEXT_ELEMENT_LIST_MARK}" # Add list element end mark to know when the list element ends elsif cell.list_flag && !content.strip.empty? _c = _c.gsub(/\\\s*$/, '\n') - cell.content = "#{_c}#{NEXT_ELEMENT_LIST_MARK}" #add the list element end mark + cell.content = "#{_c}#{NEXT_ELEMENT_LIST_MARK}" # add the list element end mark elsif _c.empty? cell.content = "\n" else @@ -137,7 +137,7 @@ def handling_content(cell, content) _c = _c.gsub(/\\\s*$/, '\n') cell.content += "#{_c}#{NEXT_ELEMENT_LIST_MARK}" elsif cell.list_flag && !_c.empty? - cell.content = cell.content.strip.chomp("#{NEXT_ELEMENT_LIST_MARK}") + cell.content = cell.content.strip.chomp(NEXT_ELEMENT_LIST_MARK.to_s) _c = _c.gsub(/\\\s*$/, '\n') cell.content += " #{_c}#{NEXT_ELEMENT_LIST_MARK}" elsif _c.empty? @@ -145,6 +145,7 @@ def handling_content(cell, content) cell.list_flag = false cell.content += "\n\n" end + cell.content += cell.content.end_with?("\n") ? "" : "\n" else _c = _c.gsub(/\\\s*$/, "\n") @@ -167,7 +168,7 @@ def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiters = ['|', '+'] positions = delimiters.filter_map do |delimiter| - pos = line[delimiter_start + 1..-1]&.index(delimiter) + pos = line[delimiter_start + 1..]&.index(delimiter) pos ? pos + delimiter_start + 1 : nil end.compact @@ -193,34 +194,36 @@ def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, def checkDelimiterAlignment(line, delimiterPositions) return false if line.empty? || delimiterPositions.empty? - #puts "\nChecking line: #{line}" - #puts "Expected delimiter positions: #{delimiterPositions}" + # puts "\nChecking line: #{line}" + # puts "Expected delimiter positions: #{delimiterPositions}" # For any row (only +, only |, mix of + and |) currentPositions = [] start_pos = 1 while start_pos < line.length - pos = line.index(/[|+]/, start_pos) # Find the next occurrence of | or + starting from start_pos - break if pos.nil? # Exit if no more delimiters are found + pos = line.index(/[|+]/, start_pos) # Find the next occurrence of | or + starting from start_pos + break if pos.nil? # Exit if no more delimiters are found currentPositions << pos - start_pos = pos + 1 # Move to the next character after the found delimiter + start_pos = pos + 1 # Move to the next character after the found delimiter end - #puts "Current positions: #{currentPositions}" + # puts "Current positions: #{currentPositions}" # Check if the last expected delimiter position is found in currentPositions - return currentPositions.include?(delimiterPositions[-1]) && - line.match?(/\A[|+]/) && # Check if the line starts with | or + - currentPositions.all? { |pos| delimiterPositions.include?(pos) } # Ensure all current positions are in delimiterPositions + currentPositions.include?(delimiterPositions[-1]) && + line.match?(/\A[|+]/) && # Check if the line starts with | or + + # Ensure all current positions are in delimiterPositions + currentPositions.all? do |pos| + delimiterPositions.include?(pos) + end end # rubocop:disable Metrics/AbcSize -- PoC # rubocop:disable Metrics/CyclomaticComplexity -- PoC # rubocop:disable Metrics/PerceivedComplexity -- PoC def parse_pandoc_table_with_spans(pandoc_table) - # Split the input into lines lines = pandoc_table.rstrip.split("\n").map(&:rstrip) @@ -237,7 +240,7 @@ def parse_pandoc_table_with_spans(pandoc_table) separator_index_max_columns = separator_indices.find { |i| lines[i].count("+") - 1 == number_of_columns } number_of_columns.times do |j| start_pos = j == 0 ? 0 : delimiter_positions[j - 1] - pos = lines[separator_index_max_columns][start_pos + 1..-1]&.index("+") + pos = lines[separator_index_max_columns][start_pos + 1..]&.index("+") delimiter_positions << (pos ? pos + start_pos + 1 : -1) end @@ -266,7 +269,7 @@ def parse_pandoc_table_with_spans(pandoc_table) end start_pos = part_index == 0 ? 0 : header_delimiter_positions[part_index - 1] - pos = lines[index][start_pos + 1..-1]&.index("+") + pos = lines[index][start_pos + 1..]&.index("+") header_delimiter_positions << (pos ? pos + start_pos + 1 : -1) end break @@ -279,22 +282,22 @@ def parse_pandoc_table_with_spans(pandoc_table) parts.each_with_index do |part, part_index| default_alignments << if part.start_with?(":") && !part.end_with?(":") - 'align="left"' - elsif !part.start_with?(":") && part.end_with?(":") - 'align="right"' - else - 'align="center"' - end + 'align="left"' + elsif !part.start_with?(":") && part.end_with?(":") + 'align="right"' + else + 'align="center"' + end start_pos = part_index == 0 ? 0 : header_delimiter_positions[part_index - 1] - pos = lines[0][start_pos + 1..-1]&.index("+") + pos = lines[0][start_pos + 1..]&.index("+") header_delimiter_positions << (pos ? pos + start_pos + 1 : -1) end end - #Check end table delimiter alignment (not checked during the lines processing) - raise "Misaligned delimiters in table separators: #{lines[-1]}" unless checkDelimiterAlignment(lines[-1], delimiter_positions) - + # Check end table delimiter alignment (not checked during the lines processing) + raise "Misaligned delimiters in table separators: #{lines[-1]}" unless checkDelimiterAlignment(lines[-1], + delimiter_positions) # Process table body (including rows belonging to header as they are processed in the same way) data_rows = [] @@ -314,8 +317,9 @@ def parse_pandoc_table_with_spans(pandoc_table) # First line (normally a separator) of each block if separator?(line) && !in_data_row in_data_row = true - #Check end table delimiter alignment (not checked during the lines processing) - raise "Misaligned delimiters in separator row: #{line}" unless checkDelimiterAlignment(line, delimiter_positions) + # Check end table delimiter alignment (not checked during the lines processing) + raise "Misaligned delimiters in separator row: #{line}" unless checkDelimiterAlignment(line, + delimiter_positions) parts = line.strip.delete_prefix("+").split("+") delimiter_index = 0 @@ -329,7 +333,7 @@ def parse_pandoc_table_with_spans(pandoc_table) delimiter_index += parts[j].length + 1 rows[-1][i].position_start = delimiter_index - parts[j].length rows[-1][i].position = delimiter_index - rows[-1][i].calculateAndSetAlignment(header_delimiter_positions, default_alignments ) + rows[-1][i].calculateAndSetAlignment(header_delimiter_positions, default_alignments) i += 1 while delimiter_index > delimiter_positions[i] i += 1 @@ -338,8 +342,9 @@ def parse_pandoc_table_with_spans(pandoc_table) elsif in_data_row # Regular data row or partial separator if GRID_TABLE_BODY_SEPARATOR.match?(line) # Partial separator - #Check end table delimiter alignment (not checked during the lines processing) - raise "Misaligned delimiters in partial separator: #{line}" unless checkDelimiterAlignment(line, delimiter_positions) + # Check end table delimiter alignment (not checked during the lines processing) + raise "Misaligned delimiters in partial separator: #{line}" unless checkDelimiterAlignment(line, + delimiter_positions) cells_content = line.strip.gsub(/^(\+|\|)/, '').split(/[\|\+]/) @@ -353,7 +358,8 @@ def parse_pandoc_table_with_spans(pandoc_table) aux_delimiter_index += cells_content[i].length + 1 rows[-1][auxiliar_cell_index].position_start = aux_delimiter_index - cells_content[i].length rows[-1][auxiliar_cell_index].position = aux_delimiter_index - rows[-1][auxiliar_cell_index].calculateAndSetAlignment(header_delimiter_positions, default_alignments ) + rows[-1][auxiliar_cell_index].calculateAndSetAlignment(header_delimiter_positions, + default_alignments) auxiliar_cell_index += 1 while aux_delimiter_index > delimiter_positions[auxiliar_cell_index] @@ -401,9 +407,9 @@ def parse_pandoc_table_with_spans(pandoc_table) end end else # Data row - cells_content = line.strip.delete_prefix("|").split(/\|/) - #Check end table delimiter alignment (not checked during the lines processing) - raise "Misaligned delimiters in row: #{line}" unless checkDelimiterAlignment(line, delimiter_positions) + cells_content = line.strip.delete_prefix("|").split("|") + # Check end table delimiter alignment (not checked during the lines processing) + raise "Misaligned delimiters in row: #{line}" unless checkDelimiterAlignment(line, delimiter_positions) column_index = 0 @@ -459,9 +465,11 @@ def parse_pandoc_table_with_spans(pandoc_table) cell.content = cell.content.gsub(/^|\s)(?\*\*|__)(?.+?)\g(?!\w)/, "\\k\\k") + cell.content = cell.content.gsub(/(?^|\s)(?\*\*|__)(?.+?)\g(?!\w)/, + "\\k\\k") - cell.content = cell.content.gsub(/(?^|\s)(?\*|_)(?.+?)\g(?!\w)/, "\\k\\k") + cell.content = cell.content.gsub(/(?^|\s)(?\*|_)(?.+?)\g(?!\w)/, + "\\k\\k") # Convert newlines to HTML breaks cell.content = cell.content&.gsub("\n", "
") end @@ -491,87 +499,87 @@ def parse_pandoc_table_with_spans(pandoc_table) end [header_rows, data_rows] - end # end of parse_pandoc_table_with_spans + end def generate_html_table_with_spans(pandoc_table) - begin - grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) - rescue StandardError => e - logger = Logger.new($stdout) - logger.error("Grid table could not be generated: #{e.message}") - - "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS" - else - html = "\n" - has_header = false + grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) + rescue StandardError => e + logger = Logger.new($stdout) + logger.error("Grid table could not be generated: #{e.message}") + + "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS" + else + html = "
\n" + has_header = false - grid_header.each do |row| - row.each do |cell| - if cell.rowspan != 0 && cell.colspan != 0 - has_header = true - break - end + grid_header.each do |row| + row.each do |cell| + if cell.rowspan != 0 && cell.colspan != 0 + has_header = true + break end end + end - if has_header - html += " \n" - grid_header.each do |row| - html += " \n" - row.each do |cell| - next if cell.rowspan == 0 || cell.colspan == 0 - - # Prepare content, in case there's a list - matches = cell.content&.scan(/\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/) - if matches - list = "
    " - matches.each do |match| - list += "
  • #{match[1]}
  • " - end - list += "
" - cell.content = cell.content.gsub(/(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/, list) - # Enforce left alignment if cell contains a list - cell.alignment = 'align="left"' - end - - rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" - colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" - html += %( #{cell.content}\n) - end - html += " \n" - end - html += " \n" - end - - html += " \n" - grid_body.each do |row| + if has_header + html += " \n" + grid_header.each do |row| html += " \n" row.each do |cell| next if cell.rowspan == 0 || cell.colspan == 0 - matches = cell.content&.scan(/\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/) + # Prepare content, in case there's a list + matches = cell.content&.scan(/\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/o) if matches list = "
    " matches.each do |match| list += "
  • #{match[1]}
  • " end - cell.content = cell.content.gsub(/(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/, list) + list += "
" + cell.content = cell.content.gsub( + /(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/o, list) # Enforce left alignment if cell contains a list cell.alignment = 'align="left"' end rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" - html += %( #{cell.content}\n) + html += %( #{cell.content}\n) end html += " \n" end + html += " \n" + end - html += " \n" - html += "
" - html + html += " \n" + grid_body.each do |row| + html += " \n" + row.each do |cell| + next if cell.rowspan == 0 || cell.colspan == 0 + + matches = cell.content&.scan(/\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/o) + if matches + list = "
    " + matches.each do |match| + list += "
  • #{match[1]}
  • " + end + cell.content = cell.content.gsub( + /(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/o, list) + # Enforce left alignment if cell contains a list + cell.alignment = 'align="left"' + end + + rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" + colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" + html += %( #{cell.content}\n) + end + html += " \n" end - end # end of def generate_html_table_with_spans + + html += " \n" + html += "" + html + end # rubocop:enable Metrics/PerceivedComplexity # rubocop:enable Metrics/CyclomaticComplexity # rubocop:enable Metrics/AbcSize @@ -594,7 +602,7 @@ def call match.to_s end end - end # end of def call - end # end of class GridTableFilter - end # end of module Filter -end # end of module Banzai + end + end + end +end -- GitLab From 08b0d44ef87931c2ac67221a209bb8bda02c46c6 Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Tue, 25 Mar 2025 12:35:47 -0500 Subject: [PATCH 08/12] Additional rubocop fixes --- lib/banzai/filter/grid_table_filter.rb | 95 ++++++++++++++------------ 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index 952d6cb13387b6..980a5b47b9d9f5 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -18,6 +18,7 @@ module Banzai module Filter class GridTableFilter < HTML::Pipeline::TextFilter + # rubocop:disable Lint/MixedRegexpCaptureTypes -- PoC MARKDOWN_GRID_TABLE_BLOCK_REGEX = %r{ (? # Grid table blocks: @@ -31,6 +32,7 @@ class GridTableFilter < HTML::Pipeline::TextFilter ) }mx + # rubocop:enable Lint/MixedRegexpCaptureTypes require 'logger' @@ -57,7 +59,7 @@ def initialize @list_flag = false end - def calculateAndSetAlignment(header_delimiter_positions, default_alignments) + def calculate_and_set_alignment(header_delimiter_positions, default_alignments) raise "Cell position must be set before calculating alignment" if @position.nil? || @position_start.nil? header_delimiter_index = 0 @@ -103,7 +105,7 @@ def []=(index, value) @row_tracker[index] = value end - def maxValue + def max_value @row_tracker.max end end @@ -114,33 +116,36 @@ def separator?(line) end # Helper method to handle content in cells + # rubocop:disable Metrics/PerceivedComplexity -- PoC def handling_content(cell, content) - _c = content.strip + modified_content = content.strip if cell.content.nil? cell.rowspan += 1 cell.colspan += 1 - if _c.start_with?("- ") # List + if modified_content.start_with?("- ") # List cell.list_flag = true - _c = _c.gsub(/\\\s*$/, '\n') - cell.content = "#{_c}#{NEXT_ELEMENT_LIST_MARK}" # Add list element end mark to know when the list element ends + modified_content = modified_content.gsub(/\\\s*$/, '\n') + + # Add list element end mark to know when the list element ends + cell.content = "#{modified_content}#{NEXT_ELEMENT_LIST_MARK}" elsif cell.list_flag && !content.strip.empty? - _c = _c.gsub(/\\\s*$/, '\n') - cell.content = "#{_c}#{NEXT_ELEMENT_LIST_MARK}" # add the list element end mark - elsif _c.empty? + modified_content = modified_content.gsub(/\\\s*$/, '\n') + cell.content = "#{modified_content}#{NEXT_ELEMENT_LIST_MARK}" # add the list element end mark + elsif modified_content.empty? cell.content = "\n" else - cell.content = _c.gsub(/\\\s*$/, "\n") + cell.content = modified_content.gsub(/\\\s*$/, "\n") end - elsif _c.start_with?("- ") + elsif modified_content.start_with?("- ") cell.content += "\n" unless cell.list_flag cell.list_flag = true - _c = _c.gsub(/\\\s*$/, '\n') - cell.content += "#{_c}#{NEXT_ELEMENT_LIST_MARK}" - elsif cell.list_flag && !_c.empty? + modified_content = modified_content.gsub(/\\\s*$/, '\n') + cell.content += "#{modified_content}#{NEXT_ELEMENT_LIST_MARK}" + elsif cell.list_flag && !modified_content.empty? cell.content = cell.content.strip.chomp(NEXT_ELEMENT_LIST_MARK.to_s) - _c = _c.gsub(/\\\s*$/, '\n') - cell.content += " #{_c}#{NEXT_ELEMENT_LIST_MARK}" - elsif _c.empty? + modified_content = modified_content.gsub(/\\\s*$/, '\n') + cell.content += " #{modified_content}#{NEXT_ELEMENT_LIST_MARK}" + elsif modified_content.empty? if cell.list_flag cell.list_flag = false cell.content += "\n\n" @@ -148,12 +153,13 @@ def handling_content(cell, content) cell.content += cell.content.end_with?("\n") ? "" : "\n" else - _c = _c.gsub(/\\\s*$/, "\n") - cell.content += " #{_c}" + modified_content = modified_content.gsub(/\\\s*$/, "\n") + cell.content += " #{modified_content}" end cell end + # rubocop:enable Metrics/PerceivedComplexity # Helper method to adjust colspan def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions) @@ -191,32 +197,32 @@ def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, row[column_index] end - def checkDelimiterAlignment(line, delimiterPositions) - return false if line.empty? || delimiterPositions.empty? + def check_delimiter_alignment(line, delimiter_positions) + return false if line.empty? || delimiter_positions.empty? # puts "\nChecking line: #{line}" - # puts "Expected delimiter positions: #{delimiterPositions}" + # puts "Expected delimiter positions: #{delimiter_positions}" # For any row (only +, only |, mix of + and |) - currentPositions = [] + current_positions = [] start_pos = 1 while start_pos < line.length pos = line.index(/[|+]/, start_pos) # Find the next occurrence of | or + starting from start_pos break if pos.nil? # Exit if no more delimiters are found - currentPositions << pos + current_positions << pos start_pos = pos + 1 # Move to the next character after the found delimiter end - # puts "Current positions: #{currentPositions}" + # puts "Current positions: #{current_positions}" - # Check if the last expected delimiter position is found in currentPositions - currentPositions.include?(delimiterPositions[-1]) && + # Check if the last expected delimiter position is found in current_positions + current_positions.include?(delimiter_positions[-1]) && line.match?(/\A[|+]/) && # Check if the line starts with | or + - # Ensure all current positions are in delimiterPositions - currentPositions.all? do |pos| - delimiterPositions.include?(pos) + # Ensure all current positions are in delimiter_positions + current_positions.all? do |pos| + delimiter_positions.include?(pos) end end @@ -296,7 +302,7 @@ def parse_pandoc_table_with_spans(pandoc_table) end # Check end table delimiter alignment (not checked during the lines processing) - raise "Misaligned delimiters in table separators: #{lines[-1]}" unless checkDelimiterAlignment(lines[-1], + raise "Misaligned delimiters in table separators: #{lines[-1]}" unless check_delimiter_alignment(lines[-1], delimiter_positions) # Process table body (including rows belonging to header as they are processed in the same way) @@ -318,7 +324,7 @@ def parse_pandoc_table_with_spans(pandoc_table) if separator?(line) && !in_data_row in_data_row = true # Check end table delimiter alignment (not checked during the lines processing) - raise "Misaligned delimiters in separator row: #{line}" unless checkDelimiterAlignment(line, + raise "Misaligned delimiters in separator row: #{line}" unless check_delimiter_alignment(line, delimiter_positions) parts = line.strip.delete_prefix("+").split("+") @@ -333,7 +339,7 @@ def parse_pandoc_table_with_spans(pandoc_table) delimiter_index += parts[j].length + 1 rows[-1][i].position_start = delimiter_index - parts[j].length rows[-1][i].position = delimiter_index - rows[-1][i].calculateAndSetAlignment(header_delimiter_positions, default_alignments) + rows[-1][i].calculate_and_set_alignment(header_delimiter_positions, default_alignments) i += 1 while delimiter_index > delimiter_positions[i] i += 1 @@ -343,7 +349,7 @@ def parse_pandoc_table_with_spans(pandoc_table) # Regular data row or partial separator if GRID_TABLE_BODY_SEPARATOR.match?(line) # Partial separator # Check end table delimiter alignment (not checked during the lines processing) - raise "Misaligned delimiters in partial separator: #{line}" unless checkDelimiterAlignment(line, + raise "Misaligned delimiters in partial separator: #{line}" unless check_delimiter_alignment(line, delimiter_positions) cells_content = line.strip.gsub(/^(\+|\|)/, '').split(/[\|\+]/) @@ -358,7 +364,7 @@ def parse_pandoc_table_with_spans(pandoc_table) aux_delimiter_index += cells_content[i].length + 1 rows[-1][auxiliar_cell_index].position_start = aux_delimiter_index - cells_content[i].length rows[-1][auxiliar_cell_index].position = aux_delimiter_index - rows[-1][auxiliar_cell_index].calculateAndSetAlignment(header_delimiter_positions, + rows[-1][auxiliar_cell_index].calculate_and_set_alignment(header_delimiter_positions, default_alignments) auxiliar_cell_index += 1 while aux_delimiter_index > delimiter_positions[auxiliar_cell_index] @@ -369,11 +375,11 @@ def parse_pandoc_table_with_spans(pandoc_table) raise "More cells than columns found" unless cells_content.length <= number_of_columns column_index = 0 - maxRowTracker = rows_tracker.maxValue + max_row_tracker = rows_tracker.max_value cells_content.each_with_index do |content, _i| if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content) # Separator - split row - rows_tracker[column_index] = maxRowTracker + 1 + rows_tracker[column_index] = max_row_tracker + 1 rows[rows_tracker[column_index]][column_index].list_flag = false column_forward = 0 @@ -409,7 +415,8 @@ def parse_pandoc_table_with_spans(pandoc_table) else # Data row cells_content = line.strip.delete_prefix("|").split("|") # Check end table delimiter alignment (not checked during the lines processing) - raise "Misaligned delimiters in row: #{line}" unless checkDelimiterAlignment(line, delimiter_positions) + raise "Misaligned delimiters in row: #{line}" unless check_delimiter_alignment( + line, delimiter_positions) column_index = 0 @@ -446,9 +453,7 @@ def parse_pandoc_table_with_spans(pandoc_table) end end - if has_header && start >= header_separator_index - rows.each { |body_row| data_rows << body_row.cells } - elsif has_header && start < header_separator_index + if has_header && start < header_separator_index rows.each { |header_row| header_rows << header_row.cells } else rows.each { |body_row| data_rows << body_row.cells } @@ -529,7 +534,9 @@ def generate_html_table_with_spans(pandoc_table) next if cell.rowspan == 0 || cell.colspan == 0 # Prepare content, in case there's a list - matches = cell.content&.scan(/\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/o) + matches = cell.content&.scan( + /\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/o) + if matches list = "
      " matches.each do |match| @@ -557,7 +564,9 @@ def generate_html_table_with_spans(pandoc_table) row.each do |cell| next if cell.rowspan == 0 || cell.colspan == 0 - matches = cell.content&.scan(/\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/o) + matches = cell.content&.scan( + /\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/o) + if matches list = "
        " matches.each do |match| -- GitLab From d7cf2c9f6f89cf18e2816ba2c2fb53cebf57c357 Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Fri, 29 Aug 2025 11:48:23 -0500 Subject: [PATCH 09/12] Update from customer --- lib/banzai/filter/grid_table_filter.rb | 182 +++++++++++++------------ 1 file changed, 98 insertions(+), 84 deletions(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index 980a5b47b9d9f5..d36d05c09b66e1 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -267,11 +267,11 @@ def parse_pandoc_table_with_spans(pandoc_table) parts.each_with_index do |part, part_index| default_alignments << if part.start_with?(":") && !part.end_with?(":") - 'align="left"' + 'left' elsif !part.start_with?(":") && part.end_with?(":") - 'align="right"' + 'right' else - 'align="center"' + 'center' end start_pos = part_index == 0 ? 0 : header_delimiter_positions[part_index - 1] @@ -284,19 +284,20 @@ def parse_pandoc_table_with_spans(pandoc_table) unless has_header # Set default alignments from the first separator which takes the role of header header_separator_index = 0 - parts = lines[0].strip.delete_prefix("+").split("+") + line = lines.find { |l| !l.strip.empty? } # first non-blank line + parts = line.strip.delete_prefix("+").split("+") parts.each_with_index do |part, part_index| default_alignments << if part.start_with?(":") && !part.end_with?(":") - 'align="left"' - elsif !part.start_with?(":") && part.end_with?(":") - 'align="right"' - else - 'align="center"' - end + 'left' + elsif !part.start_with?(":") && part.end_with?(":") + 'right' + else + 'center' + end start_pos = part_index == 0 ? 0 : header_delimiter_positions[part_index - 1] - pos = lines[0][start_pos + 1..]&.index("+") + pos = line[start_pos + 1..]&.index("+") header_delimiter_positions << (pos ? pos + start_pos + 1 : -1) end end @@ -352,17 +353,17 @@ def parse_pandoc_table_with_spans(pandoc_table) raise "Misaligned delimiters in partial separator: #{line}" unless check_delimiter_alignment(line, delimiter_positions) - cells_content = line.strip.gsub(/^(\+|\|)/, '').split(/[\|\+]/) + parts = line.strip.gsub(/^(\+|\|)/, '').split(/[\|\+]/) rows << Row.new(number_of_columns) aux_delimiter_index = 0 auxiliar_cell_index = 0 - cells_content.each_with_index do |_, i| + parts.each_with_index do |_, i| next unless auxiliar_cell_index < number_of_columns - aux_delimiter_index += cells_content[i].length + 1 - rows[-1][auxiliar_cell_index].position_start = aux_delimiter_index - cells_content[i].length + aux_delimiter_index += parts[i].length + 1 + rows[-1][auxiliar_cell_index].position_start = aux_delimiter_index - parts[i].length rows[-1][auxiliar_cell_index].position = aux_delimiter_index rows[-1][auxiliar_cell_index].calculate_and_set_alignment(header_delimiter_positions, default_alignments) @@ -372,12 +373,12 @@ def parse_pandoc_table_with_spans(pandoc_table) auxiliar_cell_index += 1 end - raise "More cells than columns found" unless cells_content.length <= number_of_columns + raise "More cells than columns found" unless parts.length <= number_of_columns column_index = 0 max_row_tracker = rows_tracker.max_value - cells_content.each_with_index do |content, _i| + parts.each_with_index do |content, _i| if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content) # Separator - split row rows_tracker[column_index] = max_row_tracker + 1 rows[rows_tracker[column_index]][column_index].list_flag = false @@ -418,6 +419,10 @@ def parse_pandoc_table_with_spans(pandoc_table) raise "Misaligned delimiters in row: #{line}" unless check_delimiter_alignment( line, delimiter_positions) + raise "Missing delimiters in previous separator line" if parts.length < cells_content.length + + #raise "Missing delimiters in row: #{line}: delimiters = #{cells_content.length}, expected delimiters = #{parts.length}" if parts.length > cells_content.length + column_index = 0 if cells_content.length < number_of_columns @@ -468,13 +473,13 @@ def parse_pandoc_table_with_spans(pandoc_table) row.each do |cell| next if cell.content.nil? - cell.content = cell.content.gsub(/^|\s)(?\*\*|__)(?.+?)\g(?!\w)/, - "\\k\\k") + #cell.content = cell.content.gsub(/(?^|\s)(?\*\*|__)(?.+?)\g(?!\w)/, + # "\\k\\k") - cell.content = cell.content.gsub(/(?^|\s)(?\*|_)(?.+?)\g(?!\w)/, - "\\k\\k") + #cell.content = cell.content.gsub(/(?^|\s)(?\*|_)(?.+?)\g(?!\w)/, + # "\\k\\k") # Convert newlines to HTML breaks cell.content = cell.content&.gsub("\n", "
        ") end @@ -486,17 +491,24 @@ def parse_pandoc_table_with_spans(pandoc_table) rows.each_with_index do |row, row_index| forward_rowspan = Array.new(row.length, 0) if forward_rowspan.empty? sum = 0 - + row_forward_rowspan = forward_rowspan.dup row.each_with_index do |cell, cell_index| sum += cell.colspan - if row_index > 0 && cell.colspan == 0 - sum += 1 if forward_rowspan[cell_index] > 0 - - forward_rowspan[cell_index] -= 1 + if cell.colspan == 0 + if row_forward_rowspan[cell_index] > 0 + sum += 1 + forward_rowspan[cell_index] -= 1 + end + end + if row_forward_rowspan[cell_index] == 0 && cell.rowspan > 1 + forward_rowspan[cell_index] = cell.rowspan - 1 + colspan = 1 + while cell.colspan > colspan + forward_rowspan[cell_index + colspan] = cell.rowspan - 1 + colspan += 1 + end end - - forward_rowspan[cell_index] = cell.rowspan - 1 if forward_rowspan[cell_index] == 0 && cell.rowspan > 1 end raise "Grid table not converted properly" unless sum == number_of_columns @@ -507,33 +519,64 @@ def parse_pandoc_table_with_spans(pandoc_table) end def generate_html_table_with_spans(pandoc_table) - grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) - rescue StandardError => e - logger = Logger.new($stdout) - logger.error("Grid table could not be generated: #{e.message}") - - "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS" - else - html = "\n" - has_header = false + begin + grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) + rescue StandardError => e + logger = Logger.new($stdout) + logger.error("Grid table could not be generated: #{e.message}") + + "\n\nHTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOG FILE\n\n#{e.message}\n\nCommit ID: ce3607dbcafafe03531c1c50b3f749cc2318656c\n\n" + else + html = '
        ' + has_header = false + + grid_header.each do |row| + row.each do |cell| + if cell.rowspan != 0 && cell.colspan != 0 + has_header = true + break + end + end + end - grid_header.each do |row| - row.each do |cell| - if cell.rowspan != 0 && cell.colspan != 0 - has_header = true - break + if has_header + html += '' + grid_header.each do |row| + html += '' + row.each do |cell| + next if cell.rowspan == 0 || cell.colspan == 0 + + # Prepare content, in case there's a list + matches = cell.content&.scan( + /\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/o) + + if matches + list = "
          " + matches.each do |match| + list += "
        • #{match[1]}
        • " + end + list += "
        " + cell.content = cell.content.gsub( + /(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/o, list) + # Enforce left alignment if cell contains a list + cell.alignment = 'left' + end + + rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" + colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" + html += %(\n\n#{cell.content}\n\n) + end + html += '' end + html += '' end - end - if has_header - html += " \n" - grid_header.each do |row| - html += " \n" + html += '' + grid_body.each do |row| + html += '' row.each do |cell| next if cell.rowspan == 0 || cell.colspan == 0 - # Prepare content, in case there's a list matches = cell.content&.scan( /\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/o) @@ -542,52 +585,23 @@ def generate_html_table_with_spans(pandoc_table) matches.each do |match| list += "
      • #{match[1]}
      • " end - list += "" cell.content = cell.content.gsub( /(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/o, list) # Enforce left alignment if cell contains a list - cell.alignment = 'align="left"' + cell.alignment = 'left' end rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" - html += %( #{cell.content}\n) + html += %(\n\n#{cell.content}\n\n) end - html += " \n" + html += '' end - html += " \n" - end - - html += " \n" - grid_body.each do |row| - html += " \n" - row.each do |cell| - next if cell.rowspan == 0 || cell.colspan == 0 - - matches = cell.content&.scan( - /\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/o) - - if matches - list = "
          " - matches.each do |match| - list += "
        • #{match[1]}
        • " - end - cell.content = cell.content.gsub( - /(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/o, list) - # Enforce left alignment if cell contains a list - cell.alignment = 'align="left"' - end - rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" - colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" - html += %( #{cell.content}\n) - end - html += "
        \n" + html += '' + html += '
        ' + html end - - html += " \n" - html += "" - html end # rubocop:enable Metrics/PerceivedComplexity # rubocop:enable Metrics/CyclomaticComplexity -- GitLab From cecca3efadc401c38c0ff23a468e474500ca27d6 Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Fri, 29 Aug 2025 11:56:11 -0500 Subject: [PATCH 10/12] Rubocop fixes --- lib/banzai/filter/grid_table_filter.rb | 57 ++++++++++++++++++-------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index d36d05c09b66e1..ff5869f8c8f226 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -284,17 +284,17 @@ def parse_pandoc_table_with_spans(pandoc_table) unless has_header # Set default alignments from the first separator which takes the role of header header_separator_index = 0 - line = lines.find { |l| !l.strip.empty? } # first non-blank line + line = lines.find { |l| !l.strip.empty? } # first non-blank line parts = line.strip.delete_prefix("+").split("+") parts.each_with_index do |part, part_index| default_alignments << if part.start_with?(":") && !part.end_with?(":") - 'left' - elsif !part.start_with?(":") && part.end_with?(":") - 'right' - else - 'center' - end + 'left' + elsif !part.start_with?(":") && part.end_with?(":") + 'right' + else + 'center' + end start_pos = part_index == 0 ? 0 : header_delimiter_positions[part_index - 1] pos = line[start_pos + 1..]&.index("+") @@ -421,7 +421,8 @@ def parse_pandoc_table_with_spans(pandoc_table) raise "Missing delimiters in previous separator line" if parts.length < cells_content.length - #raise "Missing delimiters in row: #{line}: delimiters = #{cells_content.length}, expected delimiters = #{parts.length}" if parts.length > cells_content.length + # raise "Missing delimiters in row: #{line}: delimiters = #{cells_content.length}, + # expected delimiters = #{parts.length}" if parts.length > cells_content.length column_index = 0 @@ -473,13 +474,13 @@ def parse_pandoc_table_with_spans(pandoc_table) row.each do |cell| next if cell.content.nil? - #cell.content = cell.content.gsub(/^|\s)(?\*\*|__)(?.+?)\g(?!\w)/, - # "\\k\\k") + # cell.content = cell.content.gsub(/(?^|\s)(?\*\*|__)(?.+?)\g(?!\w)/, + # "\\k\\k") - #cell.content = cell.content.gsub(/(?^|\s)(?\*|_)(?.+?)\g(?!\w)/, - # "\\k\\k") + # cell.content = cell.content.gsub(/(?^|\s)(?\*|_)(?.+?)\g(?!\w)/, + # "\\k\\k") # Convert newlines to HTML breaks cell.content = cell.content&.gsub("\n", "
        ") end @@ -488,7 +489,7 @@ def parse_pandoc_table_with_spans(pandoc_table) # Validate grid correctness forward_rowspan = [] - rows.each_with_index do |row, row_index| + rows.each_with_index do |row, _row_index| forward_rowspan = Array.new(row.length, 0) if forward_rowspan.empty? sum = 0 row_forward_rowspan = forward_rowspan.dup @@ -501,6 +502,7 @@ def parse_pandoc_table_with_spans(pandoc_table) forward_rowspan[cell_index] -= 1 end end + if row_forward_rowspan[cell_index] == 0 && cell.rowspan > 1 forward_rowspan[cell_index] = cell.rowspan - 1 colspan = 1 @@ -525,7 +527,16 @@ def generate_html_table_with_spans(pandoc_table) logger = Logger.new($stdout) logger.error("Grid table could not be generated: #{e.message}") - "\n\nHTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOG FILE\n\n#{e.message}\n\nCommit ID: ce3607dbcafafe03531c1c50b3f749cc2318656c\n\n" + <<~MESSAGE + + + HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOG FILE + + #{e.message} + + Commit ID: ce3607dbcafafe03531c1c50b3f749cc2318656c + + MESSAGE else html = '' has_header = false @@ -564,7 +575,13 @@ def generate_html_table_with_spans(pandoc_table) rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" - html += %(\n\n#{cell.content}\n\n) + html += <<~TABLE_HEADER + + + #{cell.content} + + + TABLE_HEADER end html += '' end @@ -593,7 +610,13 @@ def generate_html_table_with_spans(pandoc_table) rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : "" colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : "" - html += %(\n\n#{cell.content}\n\n) + html += <<~TABLE_DATA + + + #{cell.content} + + + TABLE_DATA end html += '' end -- GitLab From 536fe4adc199b729237450794221376c7594020f Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Fri, 29 Aug 2025 13:16:02 -0500 Subject: [PATCH 11/12] Adding an initial spec file --- .../banzai/filter/grid_table_filter_spec.rb | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 spec/lib/banzai/filter/grid_table_filter_spec.rb diff --git a/spec/lib/banzai/filter/grid_table_filter_spec.rb b/spec/lib/banzai/filter/grid_table_filter_spec.rb new file mode 100644 index 00000000000000..745f9e2dcfa834 --- /dev/null +++ b/spec/lib/banzai/filter/grid_table_filter_spec.rb @@ -0,0 +1,71 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Banzai::Filter::GridTableFilter, feature_category: :markdown do + include FilterSpecHelper + using RSpec::Parameterized::TableSyntax + + def run_pipeline(text, context = { project: nil }) + stub_commonmark_sourcepos_disabled + + Banzai.render_and_post_process(text, context) + end + + context 'when testing just with the filter' do + it 'parses a basic single row table' do + text = <<~TEXT + +-----+-----+-----+ + | A | B | C | + +-----+-----+-----+ + TEXT + + html = <<~HTML +
        + + +
        + + A + + + + B + + + + C + +
        + HTML + + expect(filter(text)).to eq html + end + end + + context 'when testing with the full pipeline' do + it 'parses a basic single row table' do + text = <<~TEXT + +-----+-----+-----+ + | A | B | C | + +-----+-----+-----+ + TEXT + + html = <<~HTML + + + + +
        +

        A

        +
        +

        B

        +
        +

        C

        +
        + HTML + + expect(run_pipeline(text)).to eq html.strip + end + end +end -- GitLab From 7935fc23b22fc34bde1d29bfdd34be2e9bd5627a Mon Sep 17 00:00:00 2001 From: Brett Walker Date: Fri, 29 Aug 2025 13:40:48 -0500 Subject: [PATCH 12/12] Minor rubocop fixes --- lib/banzai/filter/grid_table_filter.rb | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/banzai/filter/grid_table_filter.rb b/lib/banzai/filter/grid_table_filter.rb index ff5869f8c8f226..7515c0c8636583 100644 --- a/lib/banzai/filter/grid_table_filter.rb +++ b/lib/banzai/filter/grid_table_filter.rb @@ -496,20 +496,18 @@ def parse_pandoc_table_with_spans(pandoc_table) row.each_with_index do |cell, cell_index| sum += cell.colspan - if cell.colspan == 0 - if row_forward_rowspan[cell_index] > 0 - sum += 1 - forward_rowspan[cell_index] -= 1 - end + if cell.colspan == 0 && row_forward_rowspan[cell_index] > 0 + sum += 1 + forward_rowspan[cell_index] -= 1 end - if row_forward_rowspan[cell_index] == 0 && cell.rowspan > 1 - forward_rowspan[cell_index] = cell.rowspan - 1 - colspan = 1 - while cell.colspan > colspan - forward_rowspan[cell_index + colspan] = cell.rowspan - 1 - colspan += 1 - end + next unless row_forward_rowspan[cell_index] == 0 && cell.rowspan > 1 + + forward_rowspan[cell_index] = cell.rowspan - 1 + colspan = 1 + while cell.colspan > colspan + forward_rowspan[cell_index + colspan] = cell.rowspan - 1 + colspan += 1 end end @@ -520,6 +518,7 @@ def parse_pandoc_table_with_spans(pandoc_table) [header_rows, data_rows] end + # rubocop:disable Style/RedundantBegin -- PoC def generate_html_table_with_spans(pandoc_table) begin grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) @@ -626,6 +625,7 @@ def generate_html_table_with_spans(pandoc_table) html end end + # rubocop:enable Style/RedundantBegin # rubocop:enable Metrics/PerceivedComplexity # rubocop:enable Metrics/CyclomaticComplexity # rubocop:enable Metrics/AbcSize -- GitLab