From 4732c99b6498c9eef2d016d691857164965ea813 Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Tue, 15 Mar 2022 00:45:34 -0300 Subject: [PATCH 01/13] comecando o projeto --- .gitignore | 2 ++ composer.json | 5 +++++ 2 files changed, 7 insertions(+) create mode 100644 composer.json diff --git a/.gitignore b/.gitignore index 9f11b75..cc41b59 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ .idea/ +/vendor +*.lock \ No newline at end of file diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..d1c88fb --- /dev/null +++ b/composer.json @@ -0,0 +1,5 @@ +{ + "require":{ + "slim/slim": "2.0" + } +} \ No newline at end of file From 10c59f87dab794866480a32e46f04e41e9030cd2 Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Tue, 15 Mar 2022 01:49:42 -0300 Subject: [PATCH 02/13] adicionando base de pastas e arquivos --- .gitignore | 3 ++- DB_connection.php. | 13 +++++++++++++ PROJETO.md | 3 +++ composer.json | 4 +++- src/data_base/create_DB.php | 0 src/data_base/get_DB_values.php | 0 src/index.php | 0 src/router/routes.php | 0 src/scraping/scrap_values.php | 0 src/tests/DB_tests.php | 0 src/tests/get_tests.php | 0 src/tests/scrap_tests.php | 0 12 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 DB_connection.php. create mode 100644 PROJETO.md create mode 100644 src/data_base/create_DB.php create mode 100644 src/data_base/get_DB_values.php create mode 100644 src/index.php create mode 100644 src/router/routes.php create mode 100644 src/scraping/scrap_values.php create mode 100644 src/tests/DB_tests.php create mode 100644 src/tests/get_tests.php create mode 100644 src/tests/scrap_tests.php diff --git a/.gitignore b/.gitignore index cc41b59..6afd6dd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .idea/ /vendor -*.lock \ No newline at end of file +*.lock +*DB_connection.php \ No newline at end of file diff --git a/DB_connection.php. b/DB_connection.php. 
new file mode 100644 index 0000000..a369d1f --- /dev/null +++ b/DB_connection.php. @@ -0,0 +1,13 @@ +getMessage(); +} \ No newline at end of file diff --git a/PROJETO.md b/PROJETO.md new file mode 100644 index 0000000..4824fd4 --- /dev/null +++ b/PROJETO.md @@ -0,0 +1,3 @@ +sudo apt-get install php-xml +sudo apt-get install php-mbstring +sudo apt-get install -y php-mysqli diff --git a/composer.json b/composer.json index d1c88fb..8a0843f 100644 --- a/composer.json +++ b/composer.json @@ -1,5 +1,7 @@ { "require":{ - "slim/slim": "2.0" + "slim/slim": "2.0", + "phpunit/phpunit": "^9" + } } \ No newline at end of file diff --git a/src/data_base/create_DB.php b/src/data_base/create_DB.php new file mode 100644 index 0000000..e69de29 diff --git a/src/data_base/get_DB_values.php b/src/data_base/get_DB_values.php new file mode 100644 index 0000000..e69de29 diff --git a/src/index.php b/src/index.php new file mode 100644 index 0000000..e69de29 diff --git a/src/router/routes.php b/src/router/routes.php new file mode 100644 index 0000000..e69de29 diff --git a/src/scraping/scrap_values.php b/src/scraping/scrap_values.php new file mode 100644 index 0000000..e69de29 diff --git a/src/tests/DB_tests.php b/src/tests/DB_tests.php new file mode 100644 index 0000000..e69de29 diff --git a/src/tests/get_tests.php b/src/tests/get_tests.php new file mode 100644 index 0000000..e69de29 diff --git a/src/tests/scrap_tests.php b/src/tests/scrap_tests.php new file mode 100644 index 0000000..e69de29 From 4279b74c892bfab20abe3f97c9499582ffe8f5ca Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Tue, 15 Mar 2022 15:31:58 -0300 Subject: [PATCH 03/13] adicionando a coneccao com o banco de dados mysql --- .env.example | 4 ++++ .gitignore | 2 +- DB_connection.php. 
| 13 ------------- composer.json | 4 ++-- oba.txt | 1 + scriptSQL.sql | 12 ++++++++++++ src/data_base/connection.php | 15 +++++++++++++++ .../{create_DB.php => insert_DB_values.php} | 0 8 files changed, 35 insertions(+), 16 deletions(-) create mode 100644 .env.example delete mode 100644 DB_connection.php. create mode 100644 oba.txt create mode 100644 scriptSQL.sql create mode 100644 src/data_base/connection.php rename src/data_base/{create_DB.php => insert_DB_values.php} (100%) diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..64e8e6a --- /dev/null +++ b/.env.example @@ -0,0 +1,4 @@ +MYSQL_HOST=localhost +MYSQL_PORT=3306 +MYSQL_USER=root +MYSQL_PASSWORD=senhaDoDB \ No newline at end of file diff --git a/.gitignore b/.gitignore index 6afd6dd..9e35871 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ .idea/ /vendor *.lock -*DB_connection.php \ No newline at end of file +*.env \ No newline at end of file diff --git a/DB_connection.php. b/DB_connection.php. deleted file mode 100644 index a369d1f..0000000 --- a/DB_connection.php. 
+++ /dev/null @@ -1,13 +0,0 @@ -getMessage(); -} \ No newline at end of file diff --git a/composer.json b/composer.json index 8a0843f..e8a8c0e 100644 --- a/composer.json +++ b/composer.json @@ -1,7 +1,7 @@ { "require":{ "slim/slim": "2.0", - "phpunit/phpunit": "^9" - + "phpunit/phpunit": "^9", + "vlucas/phpdotenv": "^5.0" } } \ No newline at end of file diff --git a/oba.txt b/oba.txt new file mode 100644 index 0000000..ca7d7ed --- /dev/null +++ b/oba.txt @@ -0,0 +1 @@ +sadfsadsa \ No newline at end of file diff --git a/scriptSQL.sql b/scriptSQL.sql new file mode 100644 index 0000000..711d4a0 --- /dev/null +++ b/scriptSQL.sql @@ -0,0 +1,12 @@ +CREATE DATABASE IF NOT EXISTS raspagem_despesas; +USE raspagem_despesas; +CREATE TABLE IF NOT EXISTS info( +id INT AUTO_INCREMENT NOT NULL PRIMARY KEY, +mes_ano VARCHAR(7), +orgao_superior VARCHAR(60), +entidade_vinculada VARCHAR(100), +valor_empenhado FLOAT, +valor_liquidado FLOAT, +valor_pago FLOAT, +valor_restos_a_pagar_pagos FLOAT +) \ No newline at end of file diff --git a/src/data_base/connection.php b/src/data_base/connection.php new file mode 100644 index 0000000..efc538a --- /dev/null +++ b/src/data_base/connection.php @@ -0,0 +1,15 @@ +load(); +$mysql_host = $_ENV['MYSQL_HOST']; +$mysql_user = $_ENV['MYSQL_USER']; +$mysql_pass = $_ENV['MYSQL_PASSWORD']; +$mysql_db = "raspagem_despesas"; + +try{ +$conn = mysqli_connect($mysql_host, $mysql_user, $mysql_pass, $mysql_db); +echo "conectado"; +}catch(Exception $e){ + echo "Connection failed: " . 
$e->getMessage(); +} \ No newline at end of file diff --git a/src/data_base/create_DB.php b/src/data_base/insert_DB_values.php similarity index 100% rename from src/data_base/create_DB.php rename to src/data_base/insert_DB_values.php From 4b76600c95cee926f5ec31525e8e5a5088d72c89 Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Tue, 15 Mar 2022 16:34:23 -0300 Subject: [PATCH 04/13] tranformando o connection em classe --- oba.txt | 1 - src/data_base/connection.php | 46 ++++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 14 deletions(-) delete mode 100644 oba.txt diff --git a/oba.txt b/oba.txt deleted file mode 100644 index ca7d7ed..0000000 --- a/oba.txt +++ /dev/null @@ -1 +0,0 @@ -sadfsadsa \ No newline at end of file diff --git a/src/data_base/connection.php b/src/data_base/connection.php index efc538a..b29662e 100644 --- a/src/data_base/connection.php +++ b/src/data_base/connection.php @@ -1,15 +1,35 @@ -load(); -$mysql_host = $_ENV['MYSQL_HOST']; -$mysql_user = $_ENV['MYSQL_USER']; -$mysql_pass = $_ENV['MYSQL_PASSWORD']; -$mysql_db = "raspagem_despesas"; - -try{ -$conn = mysqli_connect($mysql_host, $mysql_user, $mysql_pass, $mysql_db); -echo "conectado"; -}catch(Exception $e){ - echo "Connection failed: " . $e->getMessage(); +// Documentação e repositório da biblioteca phpdotenv: https://github.com/vlucas/phpdotenv +class Connection +{ + private $host = ""; + private $db_name = "raspagem_despesas"; + private $username = ""; + private $password = ""; + public $conn; + + public function __construct() + { + $dotenv = Dotenv\Dotenv::createImmutable(__DIR__ . '/../../'); + $dotenv->load(); + $this->host = $_ENV['MYSQL_HOST'] . ":" . $_ENV['MYSQL_PORT']; + $this->username = $_ENV['MYSQL_USER']; + $this->password = $_ENV['MYSQL_PASSWORD']; + } + + public function getConnection() + { + $this->conn = null; + + try { + $this->conn = new PDO("mysql:host=" . $this->host . ";dbname=" . 
$this->db_name, $this->username, $this->password); + $this->conn->exec("set names utf8"); + + } catch (PDOException $exception) { + echo "Connection error: " . $exception->getMessage(); + } + + return $this->conn; + } } \ No newline at end of file From ce64c4c67ae0b385eb0c29f63b9a8ff5835f17ad Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Tue, 15 Mar 2022 16:47:31 -0300 Subject: [PATCH 05/13] passando de snake_case para camelCase --- src/data_base/{get_DB_values.php => getDbValues.php} | 0 .../{insert_DB_values.php => insertDbValues.php} | 0 src/scraping/{scrap_values.php => scrapValues.php} | 0 src/tests/DbTests.php | 10 ++++++++++ src/tests/UnitTest.php | 2 ++ src/tests/{DB_tests.php => getTests.php} | 0 src/tests/{get_tests.php => scrapTests.php} | 0 src/tests/scrap_tests.php | 0 8 files changed, 12 insertions(+) rename src/data_base/{get_DB_values.php => getDbValues.php} (100%) rename src/data_base/{insert_DB_values.php => insertDbValues.php} (100%) rename src/scraping/{scrap_values.php => scrapValues.php} (100%) create mode 100644 src/tests/DbTests.php create mode 100644 src/tests/UnitTest.php rename src/tests/{DB_tests.php => getTests.php} (100%) rename src/tests/{get_tests.php => scrapTests.php} (100%) delete mode 100644 src/tests/scrap_tests.php diff --git a/src/data_base/get_DB_values.php b/src/data_base/getDbValues.php similarity index 100% rename from src/data_base/get_DB_values.php rename to src/data_base/getDbValues.php diff --git a/src/data_base/insert_DB_values.php b/src/data_base/insertDbValues.php similarity index 100% rename from src/data_base/insert_DB_values.php rename to src/data_base/insertDbValues.php diff --git a/src/scraping/scrap_values.php b/src/scraping/scrapValues.php similarity index 100% rename from src/scraping/scrap_values.php rename to src/scraping/scrapValues.php diff --git a/src/tests/DbTests.php b/src/tests/DbTests.php new file mode 100644 index 0000000..96f97b8 --- /dev/null +++ b/src/tests/DbTests.php @@ -0,0 +1,10 @@ 
+assertTrue(true); + } +} \ No newline at end of file diff --git a/src/tests/UnitTest.php b/src/tests/UnitTest.php new file mode 100644 index 0000000..f7905ed --- /dev/null +++ b/src/tests/UnitTest.php @@ -0,0 +1,2 @@ + Date: Tue, 15 Mar 2022 18:49:53 -0300 Subject: [PATCH 06/13] =?UTF-8?q?desistindo=20dos=20testes=20pois=20n?= =?UTF-8?q?=C3=A3o=20consegui=20achar=20a=20solu=C3=A7=C3=A3o=20para=20o?= =?UTF-8?q?=20include=5Fpath=20do=20php?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- composer.json | 1 - src/{data_base => dataBase}/connection.php | 1 - src/{data_base => dataBase}/getDbValues.php | 0 src/{data_base => dataBase}/insertDbValues.php | 0 src/tests/DbTests.php | 10 ---------- src/tests/UnitTest.php | 2 -- src/tests/getTests.php | 0 src/tests/scrapTests.php | 0 8 files changed, 14 deletions(-) rename src/{data_base => dataBase}/connection.php (98%) rename src/{data_base => dataBase}/getDbValues.php (100%) rename src/{data_base => dataBase}/insertDbValues.php (100%) delete mode 100644 src/tests/DbTests.php delete mode 100644 src/tests/UnitTest.php delete mode 100644 src/tests/getTests.php delete mode 100644 src/tests/scrapTests.php diff --git a/composer.json b/composer.json index e8a8c0e..caffcff 100644 --- a/composer.json +++ b/composer.json @@ -1,7 +1,6 @@ { "require":{ "slim/slim": "2.0", - "phpunit/phpunit": "^9", "vlucas/phpdotenv": "^5.0" } } \ No newline at end of file diff --git a/src/data_base/connection.php b/src/dataBase/connection.php similarity index 98% rename from src/data_base/connection.php rename to src/dataBase/connection.php index b29662e..b29f8c7 100644 --- a/src/data_base/connection.php +++ b/src/dataBase/connection.php @@ -25,7 +25,6 @@ public function getConnection() try { $this->conn = new PDO("mysql:host=" . $this->host . ";dbname=" . 
$this->db_name, $this->username, $this->password); $this->conn->exec("set names utf8"); - } catch (PDOException $exception) { echo "Connection error: " . $exception->getMessage(); } diff --git a/src/data_base/getDbValues.php b/src/dataBase/getDbValues.php similarity index 100% rename from src/data_base/getDbValues.php rename to src/dataBase/getDbValues.php diff --git a/src/data_base/insertDbValues.php b/src/dataBase/insertDbValues.php similarity index 100% rename from src/data_base/insertDbValues.php rename to src/dataBase/insertDbValues.php diff --git a/src/tests/DbTests.php b/src/tests/DbTests.php deleted file mode 100644 index 96f97b8..0000000 --- a/src/tests/DbTests.php +++ /dev/null @@ -1,10 +0,0 @@ -assertTrue(true); - } -} \ No newline at end of file diff --git a/src/tests/UnitTest.php b/src/tests/UnitTest.php deleted file mode 100644 index f7905ed..0000000 --- a/src/tests/UnitTest.php +++ /dev/null @@ -1,2 +0,0 @@ - Date: Wed, 16 Mar 2022 15:59:49 -0300 Subject: [PATCH 07/13] adicionando o HeadLess e removendo o componente de classe --- composer.json | 3 ++- src/dataBase/connection.php | 47 ++++++++++++----------------------- src/scraping/filterValues.php | 3 +++ src/scraping/scrapValues.php | 26 +++++++++++++++++++ 4 files changed, 47 insertions(+), 32 deletions(-) create mode 100644 src/scraping/filterValues.php diff --git a/composer.json b/composer.json index caffcff..c705ad9 100644 --- a/composer.json +++ b/composer.json @@ -1,6 +1,7 @@ { "require":{ "slim/slim": "2.0", - "vlucas/phpdotenv": "^5.0" + "vlucas/phpdotenv": "^5.0", + "chrome-php/chrome": "^1.0" } } \ No newline at end of file diff --git a/src/dataBase/connection.php b/src/dataBase/connection.php index b29f8c7..b7c8acd 100644 --- a/src/dataBase/connection.php +++ b/src/dataBase/connection.php @@ -1,34 +1,19 @@ -load(); +$mysql_host = $_ENV['MYSQL_HOST']; +$mysql_user = $_ENV['MYSQL_USER']; +$mysql_pass = $_ENV['MYSQL_PASSWORD']; +$mysql_db = "raspagem_despesas"; - public function 
__construct() - { - $dotenv = Dotenv\Dotenv::createImmutable(__DIR__ . '/../../'); - $dotenv->load(); - $this->host = $_ENV['MYSQL_HOST'] . ":" . $_ENV['MYSQL_PORT']; - $this->username = $_ENV['MYSQL_USER']; - $this->password = $_ENV['MYSQL_PASSWORD']; +function getMysqlConnection(){ + global $mysql_host, $mysql_user, $mysql_pass, $mysql_db; + + $conn = new mysqli($mysql_host, $mysql_user, $mysql_pass, $mysql_db); + + if ($conn->connect_error) { + die("Connection failed: " . $conn->connect_error); } - - public function getConnection() - { - $this->conn = null; - - try { - $this->conn = new PDO("mysql:host=" . $this->host . ";dbname=" . $this->db_name, $this->username, $this->password); - $this->conn->exec("set names utf8"); - } catch (PDOException $exception) { - echo "Connection error: " . $exception->getMessage(); - } - - return $this->conn; - } -} \ No newline at end of file + return $conn; + } \ No newline at end of file diff --git a/src/scraping/filterValues.php b/src/scraping/filterValues.php new file mode 100644 index 0000000..b867d49 --- /dev/null +++ b/src/scraping/filterValues.php @@ -0,0 +1,3 @@ +createBrowser(); + + + try { + // creates a new page and navigate to an URL + $page = $browser->createPage(); + $page->navigate($url)->waitForNavigation(); + + return $page + ->evaluate('document.getElementsByClassName("coluna-livre")') + ->getReturnValue(); + } finally { + // bye + $browser->close(); + } +} + From 715f2588fd869305543197221ae6740ce6153167 Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Wed, 16 Mar 2022 18:37:48 -0300 Subject: [PATCH 08/13] filtrando os dados recebidos --- src/scraping/filterValues.php | 15 +++++++++++++-- src/scraping/scrapValues.php | 20 ++++++++++++++------ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/scraping/filterValues.php b/src/scraping/filterValues.php index b867d49..5a4fcf5 100644 --- a/src/scraping/filterValues.php +++ b/src/scraping/filterValues.php @@ -1,3 +1,14 @@ createPage(); 
$page->navigate($url)->waitForNavigation(); - - return $page - ->evaluate('document.getElementsByClassName("coluna-livre")') - ->getReturnValue(); + sleep(1); + $value = $page + ->evaluate("const parents = document.querySelectorAll('.coluna-livre'); + const values = []; + for (let i = 0; i < parents.length; i++) { + values.push(parents[i].innerText); + }; + values;") + ->getReturnValue(); + return $value; } finally { // bye $browser->close(); } -} - +} \ No newline at end of file From 860d2c8e7e87da74d72b882f15fd9301c15c16b0 Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Thu, 17 Mar 2022 02:07:13 -0300 Subject: [PATCH 09/13] filtrando os dados e inserindo no mysql --- scriptSQL.sql | 2 +- src/dataBase/connection.php | 2 +- src/dataBase/insertDbValues.php | 16 +++++++++++ src/scraping/filterValues.php | 51 +++++++++++++++++++++++++++++---- src/scraping/scrapValues.php | 24 ++++++++++++---- 5 files changed, 82 insertions(+), 13 deletions(-) diff --git a/scriptSQL.sql b/scriptSQL.sql index 711d4a0..d09b0da 100644 --- a/scriptSQL.sql +++ b/scriptSQL.sql @@ -3,7 +3,7 @@ USE raspagem_despesas; CREATE TABLE IF NOT EXISTS info( id INT AUTO_INCREMENT NOT NULL PRIMARY KEY, mes_ano VARCHAR(7), -orgao_superior VARCHAR(60), +orgao_superior VARCHAR(100), entidade_vinculada VARCHAR(100), valor_empenhado FLOAT, valor_liquidado FLOAT, diff --git a/src/dataBase/connection.php b/src/dataBase/connection.php index b7c8acd..c1e9e35 100644 --- a/src/dataBase/connection.php +++ b/src/dataBase/connection.php @@ -5,7 +5,7 @@ $mysql_host = $_ENV['MYSQL_HOST']; $mysql_user = $_ENV['MYSQL_USER']; $mysql_pass = $_ENV['MYSQL_PASSWORD']; -$mysql_db = "raspagem_despesas"; +$mysql_db = "raspagem_dados"; function getMysqlConnection(){ global $mysql_host, $mysql_user, $mysql_pass, $mysql_db; diff --git a/src/dataBase/insertDbValues.php b/src/dataBase/insertDbValues.php index e69de29..b4c990e 100644 --- a/src/dataBase/insertDbValues.php +++ b/src/dataBase/insertDbValues.php @@ -0,0 +1,16 @@ 
+prepare("INSERT INTO info (mes_ano,orgao_superior,entidade_vinculada,valor_empenhado,valor_liquidado,valor_pago,valor_restos_a_pagar_pagos) VALUES (?,?,?,?,?,?,?)"); + + +for ($i=0; $i < count($formattedValues[0]); $i++) { + $stmt->bind_param("sssdddd", $formattedValues[0][$i], $formattedValues[1][$i],$formattedValues[2][$i], + $formattedValues[3][$i], $formattedValues[4][$i], $formattedValues[5][$i], $formattedValues[6][$i]); + $stmt->execute(); + +} \ No newline at end of file diff --git a/src/scraping/filterValues.php b/src/scraping/filterValues.php index 5a4fcf5..791b1e4 100644 --- a/src/scraping/filterValues.php +++ b/src/scraping/filterValues.php @@ -1,9 +1,14 @@ - $value) { + $items[$key] = str_replace(".", "", $value); + $items[$key] = str_replace(" ", "", $value); + $items[$key] = str_replace(",", ".", $items[$key]); + } + return $items; +} + +function separatedValuesInVariables($items) +{ + global $date, $agency, $entities, $committedValue, $liquidatedValue, $paidValue, $unpaidValue; + $value = formatValues($items); + + for ($i = 0; $i < count($value); $i++) { + if ($i % 7 == 0) { + $date[] = $value[$i]; + } elseif ($i % 7 == 1) { + $agency[] = $value[$i]; + } elseif ($i % 7 == 2) { + $entities[] = $value[$i]; + } elseif ($i % 7 == 3) { + $committedValue[] = $value[$i]; + } elseif ($i % 7 == 4) { + $liquidatedValue[] = $value[$i]; + } elseif ($i % 7 == 5) { + $paidValue[] = $value[$i]; + } elseif ($i % 7 == 6) { + $unpaidValue[] = $value[$i]; + } + } + return [$date, $agency, $entities, $committedValue, $liquidatedValue, $paidValue, $unpaidValue]; } \ No newline at end of file diff --git a/src/scraping/scrapValues.php b/src/scraping/scrapValues.php index 6e58997..f020ef6 100644 --- a/src/scraping/scrapValues.php +++ b/src/scraping/scrapValues.php @@ -18,13 +18,25 @@ function getScrapValues(){ $page = $browser->createPage(); $page->navigate($url)->waitForNavigation(); sleep(1); + //abaixo um codigo em js que pega as tabelas e desestrutura elas + //até 
retornar um array apenas os textos contidos. $value = $page - ->evaluate("const parents = document.querySelectorAll('.coluna-livre'); - const values = []; - for (let i = 0; i < parents.length; i++) { - values.push(parents[i].innerText); - }; - values;") + ->evaluate("const even = document.querySelectorAll('.even'); + const odd = document.querySelectorAll('.odd'); + const firstValues = []; + odd.forEach((i) => firstValues.push(i)) + even.forEach((i) => firstValues.push(i)) + const cellValues = []; + firstValues.forEach((i) => cellValues.push(i.cells)) + textValues = []; + for (let i = 0; i < cellValues.length; i++) + { + for (let u = 0; u < cellValues[i].length; u++) + { + textValues.push(cellValues[i][u].textContent) + } + } + textValues;") ->getReturnValue(); return $value; } finally { From 299047149e4898b71152137154b1a2bd0db5b121 Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Thu, 17 Mar 2022 16:47:01 -0300 Subject: [PATCH 10/13] adicionando rota amigavel --- .htaccess | 4 ++++ index.php | 9 +++++++++ src/dataBase/connection.php | 14 ++++++++++++-- src/dataBase/getDbValues.php | 7 +++++++ src/dataBase/insertDbValues.php | 6 +++--- src/router/routes.php | 0 src/routes/routes.php | 15 +++++++++++++++ src/scraping/scrapValues.php | 3 +-- 8 files changed, 51 insertions(+), 7 deletions(-) create mode 100644 .htaccess create mode 100644 index.php delete mode 100644 src/router/routes.php create mode 100644 src/routes/routes.php diff --git a/.htaccess b/.htaccess new file mode 100644 index 0000000..7d31f2b --- /dev/null +++ b/.htaccess @@ -0,0 +1,4 @@ +RewriteEngine on +RewriteCond %{SCRIPT_FILENAME} !-f +RewriteCond %{SCRIPT_FILENAME} !-d +RewriteRule ^(.*)$ index.php?url=$1 [L]0 \ No newline at end of file diff --git a/index.php b/index.php new file mode 100644 index 0000000..18b8eba --- /dev/null +++ b/index.php @@ -0,0 +1,9 @@ +load(); $mysql_host = $_ENV['MYSQL_HOST']; @@ -10,7 +18,9 @@ function getMysqlConnection(){ global $mysql_host, $mysql_user, 
$mysql_pass, $mysql_db; - $conn = new mysqli($mysql_host, $mysql_user, $mysql_pass, $mysql_db); + $conn = new mysqli($mysql_host, $mysql_user, $mysql_pass, $mysql_db,); + mysqli_query($conn,"SET CHARACTER SET 'utf8'"); + mysqli_query($conn,"SET SESSION collation_connection ='utf8_unicode_ci'"); if ($conn->connect_error) { die("Connection failed: " . $conn->connect_error); diff --git a/src/dataBase/getDbValues.php b/src/dataBase/getDbValues.php index e69de29..a3e061d 100644 --- a/src/dataBase/getDbValues.php +++ b/src/dataBase/getDbValues.php @@ -0,0 +1,7 @@ +query($sql); + $res = json_encode($result->fetch_all(MYSQLI_ASSOC)); + return $res; +} \ No newline at end of file diff --git a/src/dataBase/insertDbValues.php b/src/dataBase/insertDbValues.php index b4c990e..935590e 100644 --- a/src/dataBase/insertDbValues.php +++ b/src/dataBase/insertDbValues.php @@ -1,7 +1,7 @@ get('/', function () { + echo 'home'; +}); +$app->get('/api/dados', function () { + $conn = getMysqlConnection(); + $values = getMysqlValues($conn); + echo $values; +}); +$app->run(); \ No newline at end of file diff --git a/src/scraping/scrapValues.php b/src/scraping/scrapValues.php index f020ef6..2fbf0b0 100644 --- a/src/scraping/scrapValues.php +++ b/src/scraping/scrapValues.php @@ -9,12 +9,11 @@ function getScrapValues(){ $url = 'https://www.transparencia.gov.br/despesas/orgao?ordenarPor=orgaoSuperior&direcao=asc'; $browserFactory = new BrowserFactory('google-chrome'); -// starts headless chrome $browser = $browserFactory->createBrowser(); try { - // creates a new page and navigate to an URL + // cria uma nova pagina e navega até ela $page = $browser->createPage(); $page->navigate($url)->waitForNavigation(); sleep(1); From 8ca73cbbc5626c30db07cfdc8d84b9536b64e117 Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Thu, 17 Mar 2022 21:32:01 -0300 Subject: [PATCH 11/13] pegando todos os valores da tabela e facilitando para inserir e criar um banco de dados --- scriptSQL.sql | 3 ++- 
src/dataBase/connection.php | 5 ++-- src/dataBase/createDB.php | 21 +++++++++++++++++ src/dataBase/insertDbValues.php | 39 +++++++++++++++++++++--------- src/routes/routes.php | 16 ++++++++++++- src/scraping/scrapValues.php | 42 ++++++++++++++++++--------------- 6 files changed, 92 insertions(+), 34 deletions(-) create mode 100644 src/dataBase/createDB.php diff --git a/scriptSQL.sql b/scriptSQL.sql index d09b0da..5604a3e 100644 --- a/scriptSQL.sql +++ b/scriptSQL.sql @@ -1,4 +1,5 @@ -CREATE DATABASE IF NOT EXISTS raspagem_despesas; +DROP DATABASE IF EXISTS raspagem_despesas; +CREATE DATABASE raspagem_despesas; USE raspagem_despesas; CREATE TABLE IF NOT EXISTS info( id INT AUTO_INCREMENT NOT NULL PRIMARY KEY, diff --git a/src/dataBase/connection.php b/src/dataBase/connection.php index e707a30..193eed6 100644 --- a/src/dataBase/connection.php +++ b/src/dataBase/connection.php @@ -7,14 +7,15 @@ - +// pega os valores do arquivo .env $dotenv = Dotenv\Dotenv::createImmutable(__DIR__ . '/../../'); $dotenv->load(); $mysql_host = $_ENV['MYSQL_HOST']; $mysql_user = $_ENV['MYSQL_USER']; $mysql_pass = $_ENV['MYSQL_PASSWORD']; -$mysql_db = "raspagem_dados"; +$mysql_db = "raspagem_despesas"; +//se conecta com o banco de dados function getMysqlConnection(){ global $mysql_host, $mysql_user, $mysql_pass, $mysql_db; diff --git a/src/dataBase/createDB.php b/src/dataBase/createDB.php new file mode 100644 index 0000000..3cdf34c --- /dev/null +++ b/src/dataBase/createDB.php @@ -0,0 +1,21 @@ +multi_query($sql); + +} \ No newline at end of file diff --git a/src/dataBase/insertDbValues.php b/src/dataBase/insertDbValues.php index 935590e..18394b8 100644 --- a/src/dataBase/insertDbValues.php +++ b/src/dataBase/insertDbValues.php @@ -1,16 +1,33 @@ prepare("INSERT INTO info (mes_ano,orgao_superior,entidade_vinculada,valor_empenhado,valor_liquidado,valor_pago,valor_restos_a_pagar_pagos) VALUES (?,?,?,?,?,?,?)"); +require_once("./src/scraping/filterValues.php"); 
+require_once("./src/scraping/scrapValues.php"); -for ($i=0; $i < count($formattedValues[0]); $i++) { - $stmt->bind_param("sssdddd", $formattedValues[0][$i], $formattedValues[1][$i],$formattedValues[2][$i], - $formattedValues[3][$i], $formattedValues[4][$i], $formattedValues[5][$i], $formattedValues[6][$i]); - $stmt->execute(); +function getValues() +{ + $items = getScrapValues(); + $formattedValues = separatedValuesInVariables($items); + return $formattedValues; +} +function insertInDb($conn) +{ + $formattedValues = getValues(); + $sql = "INSERT INTO info (mes_ano,orgao_superior,entidade_vinculada,valor_empenhado, + valor_liquidado,valor_pago,valor_restos_a_pagar_pagos) VALUES (?,?,?,?,?,?,?)"; + $stmt = $conn->prepare($sql); + + for ($i=0; $i < count($formattedValues[0]); $i++) { + $stmt->bind_param( + "sssdddd", + $formattedValues[0][$i], + $formattedValues[1][$i], + $formattedValues[2][$i], + $formattedValues[3][$i], + $formattedValues[4][$i], + $formattedValues[5][$i], + $formattedValues[6][$i] + ); + $stmt->execute(); + } } \ No newline at end of file diff --git a/src/routes/routes.php b/src/routes/routes.php index 0d0d514..2a3fce4 100644 --- a/src/routes/routes.php +++ b/src/routes/routes.php @@ -3,13 +3,27 @@ require_once("./vendor/autoload.php"); require_once("./src/dataBase/getDbValues.php"); require_once("./src/dataBase/connection.php"); +require_once("./src/dataBase/createDB.php"); +require_once("./src/dataBase/insertDbValues.php"); $app = new \Slim\Slim(); +$conn = getMysqlConnection(); $app->get('/', function () { echo 'home'; }); $app->get('/api/dados', function () { - $conn = getMysqlConnection(); + global $conn; $values = getMysqlValues($conn); echo $values; }); +$app->get('/DB/create', function () { + global $conn; + createDB($conn); + echo "DB created"; +}); +$app->get('/DB/populate', function () { + global $conn; + insertInDb($conn); + echo "DB populated"; +}); + $app->run(); \ No newline at end of file diff --git a/src/scraping/scrapValues.php 
b/src/scraping/scrapValues.php index 2fbf0b0..16240ae 100644 --- a/src/scraping/scrapValues.php +++ b/src/scraping/scrapValues.php @@ -3,41 +3,45 @@ // O uso do HeadLess foi uma dica do Israel Brito, agradeco muito use HeadlessChromium\BrowserFactory; -require_once '../../vendor/autoload.php'; +require_once './vendor/autoload.php'; function getScrapValues(){ $url = 'https://www.transparencia.gov.br/despesas/orgao?ordenarPor=orgaoSuperior&direcao=asc'; $browserFactory = new BrowserFactory('google-chrome'); $browser = $browserFactory->createBrowser(); - try { // cria uma nova pagina e navega até ela $page = $browser->createPage(); $page->navigate($url)->waitForNavigation(); sleep(1); + + //abaixo um codigo em js que pega as tabelas e desestrutura elas //até retornar um array apenas os textos contidos. $value = $page - ->evaluate("const even = document.querySelectorAll('.even'); - const odd = document.querySelectorAll('.odd'); - const firstValues = []; - odd.forEach((i) => firstValues.push(i)) - even.forEach((i) => firstValues.push(i)) - const cellValues = []; - firstValues.forEach((i) => cellValues.push(i.cells)) - textValues = []; - for (let i = 0; i < cellValues.length; i++) - { - for (let u = 0; u < cellValues[i].length; u++) - { - textValues.push(cellValues[i][u].textContent) - } - } - textValues;") + ->evaluate( + "textValues = []; for (let i = 0; i < 46; i++){let even = + document.querySelectorAll('.even'); + let odd = document.querySelectorAll('.odd'); + let firstValues = []; + odd.forEach((i) => firstValues.push(i)) + even.forEach((i) => firstValues.push(i)) + let cellValues = []; + firstValues.forEach((i) => cellValues.push(i.cells)); + for (let i = 0; i < cellValues.length; i++) + { + for (let u = 0; u < cellValues[i].length; u++) + { + textValues.push(cellValues[i][u].textContent) + } + } + let but = document.querySelectorAll('.paginate_button')[1]; + but.click();} + textValues;") ->getReturnValue(); - return $value; + return $value; } finally { // bye 
$browser->close(); From 310f69693e22bf4910e5a141b2384193edffa5e2 Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Fri, 18 Mar 2022 06:40:03 -0300 Subject: [PATCH 12/13] =?UTF-8?q?adicionando=20a=20documenta=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 5 +- PROJETO.md | 98 +++++++++++++++++++++++++++++++++++ index.php | 1 + src/dataBase/connection.php | 17 +++--- src/dataBase/createDB.php | 8 +-- src/dataBase/getDbValues.php | 5 +- src/index.php | 0 src/routes/routes.php | 32 +++++++++--- src/scraping/filterValues.php | 2 + src/scraping/scrapValues.php | 24 ++++++--- 10 files changed, 163 insertions(+), 29 deletions(-) delete mode 100644 src/index.php diff --git a/.env.example b/.env.example index 64e8e6a..d2e0057 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,7 @@ MYSQL_HOST=localhost MYSQL_PORT=3306 MYSQL_USER=root -MYSQL_PASSWORD=senhaDoDB \ No newline at end of file +MYSQL_PASSWORD=senhaDoDB + +#caso voce não esteja usando linux, modifique o valor abaixo para a localização do seu chrome +CHROME_PATH=google-chrome \ No newline at end of file diff --git a/PROJETO.md b/PROJETO.md index 4824fd4..9c3434a 100644 --- a/PROJETO.md +++ b/PROJETO.md @@ -1,3 +1,101 @@ + +# Web Scraping + +Projeto feito para o processo seletivo da empresa Agilize "https://agilize.com.br/". +projeto feito em PHP ,Mysql e um pouco de javaScript. 
+ + + + +## REQUISITOS PARA RODAR O PROJETO LOCALMENTE + +**Chrome** (necessário para o a biblioteca chrome-php) +``` +https://www.google.com/intl/pt-BR/chrome/ +``` + +**PHP v8.0.0^** +``` +https://www.php.net/downloads +``` + +**Composer v2.0.0^** +``` +https://getcomposer.org/download/ +``` + +**Mysql** +``` +https://www.mysql.com/downloads/ +``` + +**XAMPP** (windows) +``` +https://www.apachefriends.org/pt_br/index.html +``` + +**Apache** (linux) +``` +https://www.layerstack.com/resources/tutorials/Installing-Apache-server-on-Linux-Cloud-Servers +``` + +Dependendo da distribuição linux e da intalação do php sera necessario a instalação de alguns pacotes do PHP +``` sudo apt-get install php-xml sudo apt-get install php-mbstring sudo apt-get install -y php-mysqli + +``` + + + + +## Como iniciar o projeto + +Primeiramente va até onde esta armazenado o seu projeto, e modifique o arquivo **.env.example** +para os seus respectivos dados do Mysql. Assim que terminar renomeie o arquivo para **.env**. +Este arquivo é onde fica os dados sensiveis, que não devem ser compartilhados. + + +### Windows +Abra o aplicativo XAMPP, e inicie o serviço apache, grave a porta pois ela que voce vai usar para +acessar o servidor local + +### Linux +Acesse o repositorio do projeto, e insira os seguintes comandos no terminal: +``` + composer update + php -S localhost:8000 +``` +a porta do php pode ser modificada, mas a padrão linux é a 8000. + + +# API +Acesse as Urls abaixo para executar a criação, inserção e retorno dos dados. +Elas podem ser acessadas pelo navegador, ou aplicativos de simulação como o PostMan. +Como exemplo irei usar a porta 8000, porem use a porta em que voce inicializou o localhost, +ou a porta dada pelo XAMPP. 
+ +## Criando e populando o banco de dados + +### Criar o banco de dados +``` +http://localhost:8000/DB/create +``` + +### Inserir os dados no Mysql +``` +http://localhost:8000/DB/populate +``` + + + +## Retorno da api + +``` +http://localhost:8000/api/dados +``` + + + + diff --git a/index.php b/index.php index 18b8eba..208c34d 100644 --- a/index.php +++ b/index.php @@ -1,4 +1,5 @@ connect_error) { die("Connection failed: " . $conn->connect_error); } return $conn; - } \ No newline at end of file +} \ No newline at end of file diff --git a/src/dataBase/createDB.php b/src/dataBase/createDB.php index 3cdf34c..f75f601 100644 --- a/src/dataBase/createDB.php +++ b/src/dataBase/createDB.php @@ -1,7 +1,8 @@ multi_query($sql); - } \ No newline at end of file diff --git a/src/dataBase/getDbValues.php b/src/dataBase/getDbValues.php index a3e061d..2c9311e 100644 --- a/src/dataBase/getDbValues.php +++ b/src/dataBase/getDbValues.php @@ -1,5 +1,6 @@ -query($sql); $res = json_encode($result->fetch_all(MYSQLI_ASSOC)); diff --git a/src/index.php b/src/index.php deleted file mode 100644 index e69de29..0000000 diff --git a/src/routes/routes.php b/src/routes/routes.php index 2a3fce4..5e30271 100644 --- a/src/routes/routes.php +++ b/src/routes/routes.php @@ -10,20 +10,40 @@ $app->get('/', function () { echo 'home'; }); + $app->get('/api/dados', function () { global $conn; - $values = getMysqlValues($conn); - echo $values; + try { + $values = getMysqlValues($conn); + echo $values; + } catch (Exception $e) { + echo 'error'; + } }); + $app->get('/DB/create', function () { global $conn; - createDB($conn); - echo "DB created"; + try { + createDB($conn); + echo 'banco de dados criado'; + } catch (Exception $e) { + echo 'erro a criar a base de dados'; + } }); + $app->get('/DB/populate', function () { global $conn; - insertInDb($conn); - echo "DB populated"; + try { + $values = getMysqlValues($conn); + if ($values == '[]') { + insertInDb($conn); + echo 'populado com sucesso'; + } else { + echo 
'banco de dados ja populado'; + } + } catch (Exception) { + echo "erro a popular a base de dados"; + } }); $app->run(); \ No newline at end of file diff --git a/src/scraping/filterValues.php b/src/scraping/filterValues.php index 791b1e4..32fe84a 100644 --- a/src/scraping/filterValues.php +++ b/src/scraping/filterValues.php @@ -8,6 +8,7 @@ $unpaidValue = []; function removeUselessValues($items) +// remove os "detalhar" dos dados { array_splice($items, 0, 8); for ($i = 0; $i < count($items); $i++) { @@ -31,6 +32,7 @@ function formatValues($data) function separatedValuesInVariables($items) { + // separa os valores em variaveis global $date, $agency, $entities, $committedValue, $liquidatedValue, $paidValue, $unpaidValue; $value = formatValues($items); diff --git a/src/scraping/scrapValues.php b/src/scraping/scrapValues.php index 16240ae..8b4609c 100644 --- a/src/scraping/scrapValues.php +++ b/src/scraping/scrapValues.php @@ -3,12 +3,18 @@ // O uso do HeadLess foi uma dica do Israel Brito, agradeco muito use HeadlessChromium\BrowserFactory; +$dotenv = Dotenv\Dotenv::createImmutable(__DIR__ . '/../../'); + require_once './vendor/autoload.php'; -function getScrapValues(){ +function getScrapValues() +{ + global $dotenv; + $dotenv->load(); + $chrome = $_ENV['CHROME_PATH']; $url = 'https://www.transparencia.gov.br/despesas/orgao?ordenarPor=orgaoSuperior&direcao=asc'; - $browserFactory = new BrowserFactory('google-chrome'); + $browserFactory = new BrowserFactory($chrome); $browser = $browserFactory->createBrowser(); try { @@ -18,12 +24,13 @@ function getScrapValues(){ sleep(1); - //abaixo um codigo em js que pega as tabelas e desestrutura elas - //até retornar um array apenas os textos contidos. + //abaixo um codigo em js que pega as tabelas e desestrutura elas + //até retornar um array apenas os textos contidos. 
$value = $page ->evaluate( - "textValues = []; for (let i = 0; i < 46; i++){let even = - document.querySelectorAll('.even'); + "textValues = []; + for (let i = 0; i < 46; i++){ + let even = document.querySelectorAll('.even'); let odd = document.querySelectorAll('.odd'); let firstValues = []; odd.forEach((i) => firstValues.push(i)) @@ -39,9 +46,10 @@ function getScrapValues(){ } let but = document.querySelectorAll('.paginate_button')[1]; but.click();} - textValues;") + textValues;" + ) ->getReturnValue(); - return $value; + return $value; } finally { // bye $browser->close(); From 10c68cd6d511019269dc62e879ecb0d6d0b5c1ee Mon Sep 17 00:00:00 2001 From: marcoswarmling Date: Fri, 18 Mar 2022 06:43:39 -0300 Subject: [PATCH 13/13] melhorando o PROJETO,md --- PROJETO.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/PROJETO.md b/PROJETO.md index 9c3434a..ab05f2b 100644 --- a/PROJETO.md +++ b/PROJETO.md @@ -1,44 +1,44 @@ # Web Scraping -Projeto feito para o processo seletivo da empresa Agilize "https://agilize.com.br/". -projeto feito em PHP ,Mysql e um pouco de javaScript. - - - +Projeto feito para o processo seletivo da empresa Agilize "https://agilize.com.br/". +projeto feito em PHP ,Mysql, e um pouco de javaScript. 
+ + + ## REQUISITOS PARA RODAR O PROJETO LOCALMENTE - + **Chrome** (necessário para o a biblioteca chrome-php) ``` https://www.google.com/intl/pt-BR/chrome/ ``` - + **PHP v8.0.0^** ``` https://www.php.net/downloads ``` - + **Composer v2.0.0^** ``` https://getcomposer.org/download/ ``` - + **Mysql** ``` https://www.mysql.com/downloads/ ``` - + **XAMPP** (windows) ``` https://www.apachefriends.org/pt_br/index.html ``` - + **Apache** (linux) ``` https://www.layerstack.com/resources/tutorials/Installing-Apache-server-on-Linux-Cloud-Servers ``` - + Dependendo da distribuição linux e da intalação do php sera necessario a instalação de alguns pacotes do PHP ``` sudo apt-get install php-xml @@ -52,9 +52,9 @@ sudo apt-get install -y php-mysqli ## Como iniciar o projeto -Primeiramente va até onde esta armazenado o seu projeto, e modifique o arquivo **.env.example** -para os seus respectivos dados do Mysql. Assim que terminar renomeie o arquivo para **.env**. -Este arquivo é onde fica os dados sensiveis, que não devem ser compartilhados. +Primeiramente, vá até onde está armazenado o seu projeto e modifique o arquivo **.env.example** +com os seus respectivos dados do MySQL. Assim que terminar, renomeie o arquivo para **.env**. +Este arquivo é onde ficam os dados sensíveis, que não devem ser compartilhados. ### Windows