From 8f7b39e81d2097c089d8bf50308ca1fb8d4d5a80 Mon Sep 17 00:00:00 2001 From: Eric Masseran Date: Wed, 8 Jan 2025 15:03:54 +0100 Subject: [PATCH 1/6] Thread: use luqum.parser.lexer instead of global one --- luqum/thread.py | 5 +---- tests/alternative_lexer.py | 46 ++++++++++++++++++++++++++++++++++++++ tests/test_thread.py | 20 +++++++++++++++++ 3 files changed, 67 insertions(+), 4 deletions(-) create mode 100644 tests/alternative_lexer.py diff --git a/luqum/thread.py b/luqum/thread.py index 5a88d17..c8fd128 100644 --- a/luqum/thread.py +++ b/luqum/thread.py @@ -1,10 +1,7 @@ import threading -from ply import lex - from . import parser - thread_local = threading.local() @@ -15,5 +12,5 @@ def parse(input=None, lexer=None, debug=False, tracking=False): see: https://github.com/jurismarches/luqum/issues/72 """ if not hasattr(thread_local, "lexer"): - thread_local.lexer = lex.lexer.clone() + thread_local.lexer = parser.lexer.clone() return parser.parser.parse(input, lexer=thread_local.lexer) diff --git a/tests/alternative_lexer.py b/tests/alternative_lexer.py new file mode 100644 index 0000000..6cb4bd0 --- /dev/null +++ b/tests/alternative_lexer.py @@ -0,0 +1,46 @@ +""" +Fake Lexer to test: [Multiple Parsers and +Lexers](http://www.dabeaz.com/ply/ply.html#ply_nn37) +""" + +# List of token names. This is always required +tokens = ( + "NUMBER", + "PLUS", + "MINUS", + "TIMES", + "DIVIDE", + "LPAREN", + "RPAREN", +) + +# Regular expression rules for simple tokens +t_PLUS = r"\+" +t_MINUS = r"-" +t_TIMES = r"\*" +t_DIVIDE = r"/" +t_LPAREN = r"\(" +t_RPAREN = r"\)" + + +# A regular expression rule with some action code +def t_NUMBER(t): + r"\d+" + t.value = int(t.value) + return t + + +# Define a rule so we can track line numbers +def t_newline(t): + r"\n+" + t.lexer.lineno += len(t.value) + + +# A string containing ignored characters (spaces and tabs) +t_ignore = " \t" + + +# Error handling rule +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) diff --git a/tests/test_thread.py b/tests/test_thread.py index 82bf220..dbe73e1 100644 --- a/tests/test_thread.py +++ b/tests/test_thread.py @@ -1,8 +1,11 @@ import queue import threading +import ply.lex as lex + from luqum.parser import parser from luqum.thread import parse +from tests import alternative_lexer def test_thread_parse(): @@ -31,3 +34,20 @@ def run(q): assert result_queue.qsize() == 100 for i in range(100): assert result_queue.get() == expected_tree + + +def test_thread_lex_global_state(): + """ + Last Lexer is used globally by default by the parser. If another library + creates another lexer, it should not impact luqum. + + More info: [Multiple Parsers and + Lexers](http://www.dabeaz.com/ply/ply.html#ply_nn37) + """ + qs = '(title:"foo bar" AND body:"quick fox")' + # before_result = parse(qs) + + lex.lex(module=alternative_lexer) + # if there is a "luqum.exceptions.ParseSyntaxError", the wrong lexer was + # used. + parse(qs) From a609940ea3cfe5e5e3dca4f11f527347a19abf52 Mon Sep 17 00:00:00 2001 From: Eric Masseran Date: Wed, 8 Jan 2025 15:07:50 +0100 Subject: [PATCH 2/6] Document unused params in thread.parse() --- luqum/thread.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/luqum/thread.py b/luqum/thread.py index c8fd128..60d5891 100644 --- a/luqum/thread.py +++ b/luqum/thread.py @@ -8,8 +8,11 @@ def parse(input=None, lexer=None, debug=False, tracking=False): """A (hopefully) thread safe version of :py:meth:`luqum.parser.parse` - PLY is not thread safe because of its lexer state, but cloning it we can be thread safe. - see: https://github.com/jurismarches/luqum/issues/72 + PLY is not thread safe because of its lexer state, but cloning it we can be + thread safe. see: https://github.com/jurismarches/luqum/issues/72 + + Warning: The parameter ``lexer``, ``debug`` and ``tracking`` are not used. + They are still present for signature compatibility. """ if not hasattr(thread_local, "lexer"): thread_local.lexer = parser.lexer.clone() From e9fe1638839774e64b6aa4b23b44b38d511e805d Mon Sep 17 00:00:00 2001 From: Eric Masseran Date: Wed, 8 Jan 2025 15:30:09 +0100 Subject: [PATCH 3/6] Add a safe global parse func + add doc --- luqum/parser.py | 26 ++++++++++++++++++++++++++ tests/test_parser.py | 27 ++++++++++++++++++++++++++- tests/test_thread.py | 1 - 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/luqum/parser.py b/luqum/parser.py index ac252cc..d32d590 100644 --- a/luqum/parser.py +++ b/luqum/parser.py @@ -389,4 +389,30 @@ def p_error(p): **Note**: The parser by itself is not thread safe (because PLY is not). Use :py:func:`luqum.thread.parse` instead + +**Note 2**: This parser is using the last grammar defined globally with PLY. +Another library may update PLY global state. To avoid such issue: + +1. Provide the lexer manually to the parse function: + + .. code:: python + + luqum.parser.parser.parse(raw_lucene_query, lexer=luqum.parser.lexer) + +2. Use :func:`luqum.parser.parse` that sets the correct lexer automatically + +3. Use :py:func:`luqum.thread.parse` that automatically passes the correct lexer + """ + + +def parse(input=None, lexer=lexer, debug=False, tracking=False, tokenfunc=None): + """ + A function to parse a text Lucene query provided as ``input``. + + The function signature is based on :func:`ply.yacc.LRParser.parse` except + that ``lexer`` default to :data:`luqum.parser.lexer`. + """ + return parser.parse( + input=input, lexer=lexer, debug=debug, tracking=tracking, tokenfunc=tokenfunc + ) diff --git a/tests/test_parser.py b/tests/test_parser.py index 42388fa..1c2e46a 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,12 +1,16 @@ from decimal import Decimal from unittest import TestCase +import ply.lex as lex +import pytest + from luqum.exceptions import IllegalCharacterError, ParseSyntaxError -from luqum.parser import lexer, parser +from luqum.parser import lexer, parser, parse from luqum.tree import ( SearchField, FieldGroup, Group, Word, Phrase, Regex, Proximity, Fuzzy, Boost, Range, From, To, Not, AndOperation, OrOperation, Plus, Prohibit, UnknownOperation) +from tests import alternative_lexer class TestLexer(TestCase): @@ -526,3 +530,24 @@ def test_illegal_character_exception(self): str(raised.exception), "Illegal character '\\' at position 0", ) + + +def test_lex_global_state(): + """ + Last Lexer is used globally by default by the parser. If another library + creates another lexer, it should not impact luqum. + + More info: [Multiple Parsers and + Lexers](http://www.dabeaz.com/ply/ply.html#ply_nn37) + """ + qs = '(title:"foo bar" AND body:"quick fox")' + + lex.lex(module=alternative_lexer) + + with pytest.raises(ParseSyntaxError): + parser.parse(qs) + + parser.parse(qs, lexer=lexer) + # if there is a "luqum.exceptions.ParseSyntaxError", the wrong lexer was + # used. + parse(qs) diff --git a/tests/test_thread.py b/tests/test_thread.py index dbe73e1..b7deba3 100644 --- a/tests/test_thread.py +++ b/tests/test_thread.py @@ -45,7 +45,6 @@ def test_thread_lex_global_state(): Lexers](http://www.dabeaz.com/ply/ply.html#ply_nn37) """ qs = '(title:"foo bar" AND body:"quick fox")' - # before_result = parse(qs) lex.lex(module=alternative_lexer) # if there is a "luqum.exceptions.ParseSyntaxError", the wrong lexer was From 5745888489e30e578204291d8b97af314c2bee76 Mon Sep 17 00:00:00 2001 From: Eric Masseran Date: Wed, 8 Jan 2025 15:39:22 +0100 Subject: [PATCH 4/6] Polish doc --- docs/source/api.rst | 2 +- luqum/parser.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 27f630b..20a1fe0 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -10,7 +10,7 @@ luqum.parser --------------- .. automodule:: luqum.parser - :members: parser + :members: parser, parse luqum.threading --------------- diff --git a/luqum/parser.py b/luqum/parser.py index d32d590..3941856 100644 --- a/luqum/parser.py +++ b/luqum/parser.py @@ -393,15 +393,16 @@ def p_error(p): **Note 2**: This parser is using the last grammar defined globally with PLY. Another library may update PLY global state. To avoid such issue: -1. Provide the lexer manually to the parse function: +1. Use :func:`luqum.parser.parse` or :py:func:`luqum.thread.parse` that set the + correct lexer automatically - .. code:: python +2. Provide the lexer manually to the parse function: - luqum.parser.parser.parse(raw_lucene_query, lexer=luqum.parser.lexer) + .. code-block:: python -2. Use :func:`luqum.parser.parse` that sets the correct lexer automatically - -3. Use :py:func:`luqum.thread.parse` that automatically passes the correct lexer + luqum.parser.parser.parse( + lucene_query, lexer=luqum.parser.lexer + ) """ From f9fa551a13b4eb814a613027d653ed700f749487 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Sat, 8 Feb 2025 01:47:55 +0100 Subject: [PATCH 5/6] feat: force lexer in parser.parse --- luqum/parser.py | 8 +++++++- tests/test_parser.py | 8 ++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/luqum/parser.py b/luqum/parser.py index 7f6e153..e7b0a18 100644 --- a/luqum/parser.py +++ b/luqum/parser.py @@ -428,6 +428,8 @@ def p_error(p): """ +_orig_parse = parser.parse + def parse(input=None, lexer=lexer, debug=False, tracking=False, tokenfunc=None): """ @@ -436,6 +438,10 @@ def parse(input=None, lexer=lexer, debug=False, tracking=False, tokenfunc=None): The function signature is based on :func:`ply.yacc.LRParser.parse` except that ``lexer`` default to :data:`luqum.parser.lexer`. """ - return parser.parse( + return _orig_parse( input=input, lexer=lexer, debug=debug, tracking=tracking, tokenfunc=tokenfunc ) + + +# avoid confusion in lexers by monkey patching +parser.parse = parse diff --git a/tests/test_parser.py b/tests/test_parser.py index 3d0f290..fb17d06 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -556,9 +556,9 @@ def test_lex_global_state(): lex.lex(module=alternative_lexer) with pytest.raises(ParseSyntaxError): - parser.parse(qs) + parser.parse(qs, lexer=lex.lexer) - parser.parse(qs, lexer=lexer) - # if there is a "luqum.exceptions.ParseSyntaxError", the wrong lexer was - # used. + # if there is a "luqum.exceptions.ParseSyntaxError", + # the wrong lexer was used. parse(qs) + parser.parse(qs) From 99662a3b103492d433b3b9f7d79ed3efeeacafce Mon Sep 17 00:00:00 2001 From: Eric Masseran Date: Wed, 12 Feb 2025 17:24:57 +0100 Subject: [PATCH 6/6] Remove Note 2 --- luqum/parser.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/luqum/parser.py b/luqum/parser.py index e7b0a18..900f092 100644 --- a/luqum/parser.py +++ b/luqum/parser.py @@ -411,21 +411,6 @@ def p_error(p): **Note**: The parser by itself is not thread safe (because PLY is not). Use :py:func:`luqum.thread.parse` instead - -**Note 2**: This parser is using the last grammar defined globally with PLY. -Another library may update PLY global state. To avoid such issue: - -1. Use :func:`luqum.parser.parse` or :py:func:`luqum.thread.parse` that set the - correct lexer automatically - -2. Provide the lexer manually to the parse function: - - .. code-block:: python - - luqum.parser.parser.parse( - lucene_query, lexer=luqum.parser.lexer - ) - """ _orig_parse = parser.parse