Files
extraccion/app.py
2024-09-13 18:58:20 +00:00

193 lines
7.3 KiB
Python

import os
from io import StringIO
from urllib.parse import quote

import pandas as pd
import psycopg2
from flask import Flask, request, jsonify
from psycopg2 import pool
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from waitress import serve
# Module-wide PostgreSQL connection pool (between 1 and 10 connections).
# Every connection setting is read from the environment at import time.
connection_pool = pool.ThreadedConnectionPool(
    1,   # minconn
    10,  # maxconn: upper bound on simultaneously open connections
    dbname=os.environ.get("DBNAME"),
    user=os.environ.get("DBUSER"),
    password=os.environ.get("DBPASSWORD"),
    host=os.environ.get("DBHOST"),
    port=os.environ.get("DBPORT"),
)
def get_db_connection():
    """Borrow a connection from the module-level ``connection_pool``.

    Returns:
        Whatever ``connection_pool.getconn()`` hands out.
    """
    borrowed = connection_pool.getconn()
    return borrowed
def release_db_connection(conn):
    """Hand ``conn`` back to the module-level ``connection_pool``.

    Args:
        conn: A connection previously obtained from the pool.
    """
    connection_pool.putconn(conn=conn)
# Module-level WebDriver placeholder; no browser is started at import time.
driver = None

# chromedriver binary used to launch every browser session.
service = Service(executable_path='/usr/bin/chromedriver')

# Launch flags for the Chromium browser.
chrome_options = Options()
chrome_options.add_argument("--headless")  # run without a visible window
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
def insert_alumno_extraccion(datos_html: str, materias_html: str, username_integer: int, historial_html: str = 'error', materias_actuales_html: str = 'error'):
    """Insert or refresh the scraped data of one student.

    Runs an upsert on ``public.alumno_extraccion`` keyed by
    ``"Usuario_claveULSA"``. On conflict the stored ``datos_html``,
    ``materias_html`` and ``historial_html`` are replaced, ``error_message``
    is cleared, ``registrado`` is reset to its column default and
    ``updated_at`` is refreshed. ``materias_actuales_html`` is only written
    when the row is first inserted; the conflict branch does not touch it.

    Args:
        datos_html: HTML of the student's data section.
        materias_html: HTML of the subjects section.
        username_integer: Numeric student key used as the row identifier.
        historial_html: Text that the statement casts to ``JSONB``.
            NOTE(review): the default ``'error'`` is not valid JSON, so
            relying on the default makes the statement fail - confirm that
            every caller passes JSON text or ``None``.
        materias_actuales_html: HTML of the current subjects.

    Returns:
        The SQL text that was executed, or ``None`` when the statement failed
        (the error is printed, not raised).
    """
    insert_query = """
        INSERT INTO public.alumno_extraccion ("Usuario_claveULSA", datos_html, materias_html, historial_html, materias_actuales_html, updated_at)
        VALUES (%s, TRIM(%s), TRIM(%s), TRIM(%s)::JSONB, TRIM(%s), NOW())
        ON CONFLICT ("Usuario_claveULSA") DO UPDATE
        SET datos_html = EXCLUDED.datos_html,
            materias_html = EXCLUDED.materias_html,
            error_message = NULL,
            registrado = DEFAULT,
            historial_html = EXCLUDED.historial_html,
            updated_at = NOW();
    """
    conn = get_db_connection()  # borrow a connection from the pool
    try:
        # The cursor is managed by ``with`` so that a failure while creating
        # it can no longer reach a ``cur.close()`` on an unbound name, which
        # used to raise from ``finally`` and skip returning the connection.
        with conn.cursor() as cur:
            cur.execute(insert_query, (username_integer, datos_html, materias_html, historial_html, materias_actuales_html))
            conn.commit()
            return cur.query.decode('utf-8')
    except psycopg2.ProgrammingError as e:
        print(f"Error de sintaxis: {e}")
    except psycopg2.IntegrityError as e:
        print(f"Error de integridad: {e}")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        release_db_connection(conn)  # always give the connection back
    return None
def update_alumno_extraccion_error(username_integer: int, error: str):
    """Record an extraction failure for one student.

    Runs an upsert on ``public.alumno_extraccion`` keyed by
    ``"Usuario_claveULSA"``: stores ``error`` in ``error_message``, resets
    ``registrado`` to its column default and refreshes ``updated_at``.

    Args:
        username_integer: Numeric student key used as the row identifier.
        error: Human-readable description of what went wrong.

    Raises:
        Exception: Anything other than ``psycopg2.ProgrammingError`` raised
            while running the statement propagates to the caller.
    """
    update_query = """
        INSERT INTO public.alumno_extraccion ("Usuario_claveULSA", error_message, updated_at) VALUES (%s, %s, NOW())
        ON CONFLICT ("Usuario_claveULSA") DO UPDATE
        SET error_message = EXCLUDED.error_message,
            registrado = DEFAULT,
            updated_at = NOW();
    """
    conn = get_db_connection()  # borrow a connection from the pool
    try:
        # The cursor is managed by ``with`` so that a failure while creating
        # it can no longer reach a ``cur.close()`` on an unbound name, which
        # used to raise from ``finally`` and skip returning the connection.
        with conn.cursor() as cur:
            cur.execute(update_query, (username_integer, error))
            conn.commit()
        print("Data updated successfully")
    except psycopg2.ProgrammingError as e:
        print(f"Error de sintaxis: {e}")
    finally:
        release_db_connection(conn)  # always give the connection back
def se_puede_extraer():
    """Tell whether today falls inside a configured extraction window.

    Returns:
        ``True`` when ``alumno_extraccion_fecha`` holds a row whose
        ``fecha_inicio``..``fecha_fin`` range contains the current date,
        ``False`` when no such row exists, and ``None`` when the lookup
        fails (the error is printed, not raised).
    """
    window_sql = """
        SELECT 1
        FROM alumno_extraccion_fecha
        WHERE CURRENT_DATE BETWEEN fecha_inicio AND fecha_fin
        ORDER BY CREATED_AT DESC
        LIMIT 1;
    """
    db_conn = get_db_connection()  # borrow a connection from the pool
    try:
        with db_conn.cursor() as cur:
            cur.execute(window_sql)
            matching_row = cur.fetchone()
            return matching_row is not None
    except psycopg2.Error as db_error:
        print(f"Error en la base de datos: {db_error}")
        return None
    except Exception as other_error:
        print(f"Error general: {other_error}")
        return None
    finally:
        release_db_connection(db_conn)  # always give the connection back
def _periodo_as_text(value):
    """Render a numeric PERIODO cell as text, dropping only a trailing ``.0``."""
    if not isinstance(value, (float, int)):
        return value
    text = str(value)
    # Strip the suffix only: a blanket replace('.0', '') would also mangle
    # values such as 100.05 into '1005'.
    return text[:-2] if text.endswith('.0') else text


def extract(driver, username: str, password: str):
    """Scrape one student's records from the SGU portal and store them.

    The page is loaded once with the credentials embedded in the URL and
    then again without them. The data section, the academic-history section
    and the subjects table are read from the page, the table is converted to
    JSON records, and everything is saved with ``insert_alumno_extraccion``.
    Any failure after the student key has been parsed is recorded with
    ``update_alumno_extraccion_error`` instead of being raised.

    Args:
        driver: Selenium WebDriver used to browse the portal.
        username: Student key; everything after its first two characters
            must be an integer.
        password: Portal password for that key.

    Returns:
        The subjects table as a JSON string (``orient='records'``), or
        ``None`` when the table has no ``GRUPO`` column or the extraction
        failed.

    Raises:
        ValueError: If ``username[2:]`` is not an integer.
        TypeError: If ``username`` is ``None``.
    """
    url = 'https://sgu.ulsa.edu.mx/psulsa/alumnos/consultainformacionalumnos/consultainformacion.aspx'
    # The row identifier is the key without its first two characters.
    username_integer = int(username[2:])
    try:
        # Percent-encode the credentials so characters such as '@', ':' or
        # '/' in a password cannot change how the URL is parsed.
        url_credentials = (
            f"https://{quote(username, safe='')}:{quote(password, safe='')}"
            "@sgu.ulsa.edu.mx/psulsa/alumnos/consultainformacionalumnos/consultainformacion.aspx"
        )
        driver.get(url_credentials)
        driver.get(url)
        # If this element is missing, find_element raises and the failure is
        # recorded by the handler below.
        datos_html = driver.find_element(By.ID, 'ctl00_contenedor_control').get_attribute('innerHTML')
        # WAIT_TIME comes from the environment as text (or is unset); turn it
        # into a number and fall back to 10 seconds when it is missing/empty.
        wait_seconds = float(os.getenv("WAIT_TIME") or 10)
        elemento = WebDriverWait(driver, wait_seconds).until(
            EC.presence_of_element_located((By.ID, 'ctl00_contenedor_HistorialAlumno1_lblBtnSeccionHAcademico'))
        )
        # Retry a few times in case of a StaleElementReferenceException.
        for _ in range(3):
            try:
                elemento.click()
                break  # clicked successfully, stop retrying
            except StaleElementReferenceException:
                print("Elemento 'stale', intentando de nuevo...")
                elemento = driver.find_element(By.ID, 'ctl00_contenedor_HistorialAlumno1_lblBtnSeccionHAcademico')
        # HTML of the academic-history section and of the subjects table.
        materias_html = driver.find_element(By.ID, 'ctl00_contenedor_HistorialAlumno1_divHAcademico').get_attribute('innerHTML')
        historial_html = driver.find_element(By.ID, 'ctl00_contenedor_HistorialAlumno1_gvMaterias').get_attribute('innerHTML')
        # The table's inner HTML is wrapped in <table> so pandas can parse it.
        df = pd.read_html(StringIO(f"<table>{historial_html}</table>"))[0]
        json_result = None
        if 'GRUPO' in df.columns:
            df['PERIODO'] = df['PERIODO'].apply(_periodo_as_text)
            # Rows whose GRUPO cell is 'Promedio:' are left out of the JSON.
            json_result = df[df['GRUPO'] != 'Promedio:'].to_json(orient='records')
        insert_alumno_extraccion(datos_html, materias_html, username_integer, json_result)
        print("Data extracted successfully")
        return json_result
    except Exception as e:
        update_alumno_extraccion_error(username_integer, str(e))
        return None
# Flask application object; the /calificaciones route below is registered on it.
app = Flask(__name__)
@app.route('/calificaciones', methods=['POST'])
def main():
    """Handle ``POST /calificaciones``: run an extraction for one student.

    Reads the ``clave`` and ``password`` form fields. When
    ``se_puede_extraer()`` reports that extraction is currently allowed, a
    Chrome WebDriver is started, ``extract`` is run with it, and the browser
    is always shut down afterwards.

    Returns:
        A JSON body with ``message`` and ``en-fecha`` (the value returned by
        ``se_puede_extraer()``). The status is 400 when extraction is allowed
        but ``clave`` or ``password`` is missing from the form.
    """
    username = request.form.get('clave')
    password = request.form.get('password')
    se_puede = se_puede_extraer()
    if se_puede:
        if username is None or password is None:
            # Without both fields the extraction cannot start; answer with a
            # client error instead of crashing on the missing value.
            return jsonify({"message": "Missing 'clave' or 'password' form field", "en-fecha": se_puede}), 400
        # Bound before the try so the cleanup below is safe even when Chrome
        # fails to start; the browser is only launched when it is needed.
        driver = None
        try:
            driver = webdriver.Chrome(service=service, options=chrome_options)
            extract(driver, username, password)
        finally:
            if driver is not None:
                driver.quit()  # make sure the browser is closed
    return jsonify({"message": "Data extracted successfully", "en-fecha": se_puede})
# When run as a script, serve the Flask app with waitress on all interfaces.
if __name__ == "__main__":
    serve(app, host="0.0.0.0", port=5000)