Files
extraccion/app.py
2024-09-10 09:48:07 -06:00

189 lines
7.2 KiB
Python

import psycopg2
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
import os
from flask import Flask, request, jsonify
from waitress import serve
import pandas as pd
from io import StringIO
# Set options for the Chromium browser
chrome_options = Options()
chrome_options.add_argument("--headless") # Optional: Run Chromium in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Specify the path to the Chromium driver
service = Service('/usr/bin/chromedriver')
driver = None
def insert_alumno_extraccion(datos_html: str, materias_html: str, historial_html: str = 'error', materias_actuales_html: str = 'error'):
try:
conn = psycopg2.connect(
dbname=os.getenv("DBNAME"),
user=os.getenv("DBUSER"),
password=os.getenv("DBPASSWORD"),
host=os.getenv("DBHOST"),
port=os.getenv("DBPORT")
)
cur = conn.cursor()
insert_query = """
INSERT INTO public.alumno_extraccion ("Usuario_claveULSA", datos_html, materias_html, historial_html, materias_actuales_html) VALUES (%s, TRIM(%s), TRIM(%s), TRIM(%s)::JSONB, TRIM(%s))
ON CONFLICT ("Usuario_claveULSA") DO UPDATE SET datos_html = EXCLUDED.datos_html, materias_html = EXCLUDED.materias_html, error_message = NULL, registrado = DEFAULT, historial_html = EXCLUDED.historial_html;
"""
cur.execute(insert_query, (username_integer, datos_html, materias_html, historial_html, materias_actuales_html))
conn.commit()
return cur.query.decode('utf-8')
except psycopg2.ProgrammingError as e:
print(f"Error de sintaxis: {e}")
except psycopg2.IntegrityError as e:
print(f"Error de integridad: {e}")
except Exception as e:
print(f"Error: {e}")
finally:
cur.close()
conn.close()
def update_alumno_extraccion_error(error: str):
try:
conn = psycopg2.connect(
dbname=os.getenv("DBNAME"),
user=os.getenv("DBUSER"),
password=os.getenv("DBPASSWORD"),
host=os.getenv("DBHOST"),
port=os.getenv("DBPORT")
)
cur = conn.cursor()
update_query = """
INSERT INTO public.alumno_extraccion ("Usuario_claveULSA", error_message) VALUES (%s, %s)
ON CONFLICT ("Usuario_claveULSA") DO UPDATE SET error_message = EXCLUDED.error_message,
materias_html = DEFAULT, registrado = DEFAULT;
"""
cur.execute(update_query, (username_integer, error[:255]))
conn.commit()
print("Data updated successfully")
except psycopg2.ProgrammingError as e:
print(f"Error de sintaxis: {e}")
finally:
cur.close()
conn.close()
def extract(username: str, password: str):
url_credentials = f'https://{username}:{password}@sgu.ulsa.edu.mx/psulsa/alumnos/consultainformacionalumnos/consultainformacion.aspx'
url = 'https://sgu.ulsa.edu.mx/psulsa/alumnos/consultainformacionalumnos/consultainformacion.aspx'
username_integer = int(username[2:])
try:
driver.get(url_credentials)
driver.get(url)
# si no existe el elemento, ctl00_contenedor_control
datos_html = driver.find_element(By.ID, 'ctl00_contenedor_control').get_attribute('innerHTML')
elemento = WebDriverWait(driver, 3.5).until(
EC.presence_of_element_located((By.ID, 'ctl00_contenedor_HistorialAlumno1_lblBtnSeccionHAcademico'))
)
# Intentar varias veces en caso de un `StaleElementReferenceException`
for _ in range(3):
try:
elemento.click()
break # Si se hace clic correctamente, salir del bucle
except StaleElementReferenceException:
print("Elemento 'stale', intentando de nuevo...")
elemento = driver.find_element(By.ID, 'ctl00_contenedor_HistorialAlumno1_lblBtnSeccionHAcademico')
# Obtener el HTML de las materias
materias_html = driver.find_element(By.ID, 'ctl00_contenedor_HistorialAlumno1_divHAcademico').get_attribute('innerHTML')
historial_html = driver.find_element(By.ID, 'ctl00_contenedor_HistorialAlumno1_gvMaterias').get_attribute('innerHTML')
# Manejar el historial como DataFrame
historial_html_io = StringIO(f"<table>{historial_html}</table>")
df = pd.read_html(historial_html_io)[0]
if 'GRUPO' not in df.columns:
raise KeyError("Column 'GRUPO' not found in the DataFrame")
df['PERIODO'] = df['PERIODO'].apply(lambda x: str(x).replace('.0', '') if isinstance(x, (float, int)) else x)
json_result = df[df['GRUPO'] != 'Promedio:'].to_json(orient='records')
query = insert_alumno_extraccion(datos_html, materias_html, json_result)
print("Data extracted successfully")
return json_result
except NoSuchElementException as e:
update_alumno_extraccion_error(str(e))
def se_puede():
try:
# Establece la conexión a la base de datos usando with para gestionar automáticamente el cierre
with psycopg2.connect(
dbname=os.getenv("DBNAME"),
user=os.getenv("DBUSER"),
password=os.getenv("DBPASSWORD"),
host=os.getenv("DBHOST"),
port=os.getenv("DBPORT")
) as conn:
with conn.cursor() as cursor:
# SELECCIONAR ÚLTIMA planeacion
query = """
SELECT 1
FROM alumno_extraccion_fecha
WHERE CURRENT_DATE BETWEEN fecha_inicio AND fecha_fin
ORDER BY CREATED_AT DESC
LIMIT 1;
"""
# Ejecuta la consulta
cursor.execute(query)
result = cursor.fetchone()
# Verifica si se obtuvo algún resultado
return result is not None
except psycopg2.Error as e:
# Maneja errores específicos de la base de datos
print(f"Error en la base de datos: {e}")
except Exception as e:
# Maneja otros tipos de errores
print(f"Error general: {e}")
return False
app = Flask(__name__)
@app.route('/calificaciones', methods=['POST'])
def main():
global driver
# Initialize the WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)
username = request.form.get('clave')
password = request.form.get('password')
se_puede = se_puede_extraer()
if se_puede:
query = extract(username, password)
# Close the session
driver.quit()
return jsonify({"message": "Data extracted successfully", "en-fecha": se_puede})
if __name__ == '__main__':
serve(app, host='0.0.0.0', port=5000)