Files
extraccion/app.py

190 lines
7.0 KiB
Python

import psycopg2
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
from flask import Flask, request, jsonify
from waitress import serve
import pandas as pd
from io import StringIO
# Set options for the Chromium browser
chrome_options = Options()
chrome_options.add_argument("--headless") # Optional: Run Chromium in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Specify the path to the Chromium driver
service = Service('/usr/bin/chromedriver')
driver = None
def extract(username: str, password: str):
url_credentials = f'https://{username}:{password}@sgu.ulsa.edu.mx/psulsa/alumnos/consultainformacionalumnos/consultainformacion.aspx'
url = 'https://sgu.ulsa.edu.mx/psulsa/alumnos/consultainformacionalumnos/consultainformacion.aspx'
username_integer = int(username[2:])
def insert_alumno_extraccion(datos_html: str, materias_html: str, historial_html: str = 'error', materias_actuales_html: str = 'error'):
try:
conn = psycopg2.connect(
dbname=os.getenv("DBNAME"),
user=os.getenv("DBUSER"),
password=os.getenv("DBPASSWORD"),
host=os.getenv("DBHOST"),
port=os.getenv("DBPORT")
)
cur = conn.cursor()
insert_query = """
INSERT INTO public.alumno_extraccion ("Usuario_claveULSA", datos_html, materias_html, historial_html, materias_actuales_html) VALUES (%s, TRIM(%s), TRIM(%s), TRIM(%s)::JSONB, TRIM(%s))
ON CONFLICT ("Usuario_claveULSA") DO UPDATE SET datos_html = EXCLUDED.datos_html, materias_html = EXCLUDED.materias_html, error_message = NULL, registrado = DEFAULT, historial_html = EXCLUDED.historial_html;
"""
cur.execute(insert_query, (username_integer, datos_html, materias_html, historial_html, materias_actuales_html))
conn.commit()
return cur.query.decode('utf-8')
except psycopg2.ProgrammingError as e:
print(f"Error de sintaxis: {e}")
except psycopg2.IntegrityError as e:
print(f"Error de integridad: {e}")
except Exception as e:
print(f"Error: {e}")
finally:
cur.close()
conn.close()
def update_alumno_extraccion_error(error: str):
try:
conn = psycopg2.connect(
dbname=os.getenv("DBNAME"),
user=os.getenv("DBUSER"),
password=os.getenv("DBPASSWORD"),
host=os.getenv("DBHOST"),
port=os.getenv("DBPORT")
)
cur = conn.cursor()
update_query = """
INSERT INTO public.alumno_extraccion ("Usuario_claveULSA", error_message) VALUES (%s, %s)
ON CONFLICT ("Usuario_claveULSA") DO UPDATE SET error_message = EXCLUDED.error_message,
materias_html = DEFAULT, registrado = DEFAULT;
"""
cur.execute(update_query, (username_integer, error[:255]))
conn.commit()
print("Data updated successfully")
except psycopg2.ProgrammingError as e:
print(f"Error de sintaxis: {e}")
finally:
cur.close()
conn.close()
try:
driver.get(url_credentials)
driver.get(url)
# si no existe el elemento, ctl00_contenedor_control
datos_html = driver.find_element(By.ID, 'ctl00_contenedor_control').get_attribute('innerHTML')
elemento = WebDriverWait(driver, 3.5).until(
EC.presence_of_element_located((By.ID, 'ctl00_contenedor_HistorialAlumno1_lblBtnSeccionHAcademico'))
)
elemento.click()
# Get the HTML content of the materias element
materias_html = driver.find_element(By.ID, 'ctl00_contenedor_HistorialAlumno1_divHAcademico').get_attribute('innerHTML')
historial_html = driver.find_element(By.ID, 'ctl00_contenedor_HistorialAlumno1_gvMaterias').get_attribute('innerHTML')
# materias_actuales_html = driver.find_element(By.ID, 'ctl00_contenedor_HistorialAlumno1_div13').get_attribute('innerHTML')
historial_html_io = StringIO(f"<table>{historial_html}</table>")
# Read the HTML table into a DataFrame
df = pd.read_html(historial_html_io)[0]
if 'GRUPO' not in df.columns:
raise KeyError("Column 'GRUPO' not found in the DataFrame")
# Remove decimals from the 'PERIODO' column (if present)
df['PERIODO'] = df['PERIODO'].apply(lambda x: str(x).replace('.0', '') if isinstance(x, (float, int)) else x)
# Convert the DataFrame to JSON
json_result = df[df['GRUPO'] != 'Promedio:'].to_json(orient='records')
# Connect to PostgreSQL database
query = insert_alumno_extraccion(datos_html, materias_html, json_result)
print("Data extracted successfully")
return json_result
except NoSuchElementException as e:
update_alumno_extraccion_error(str(e))
def se_puede_extraer():
try:
conn = conn = psycopg2.connect(
dbname=os.getenv("DBNAME"),
user=os.getenv("DBUSER"),
password=os.getenv("DBPASSWORD"),
host=os.getenv("DBHOST"),
port=os.getenv("DBPORT")
)
cursor = conn.cursor()
# SELECCIONAR ULTIMA planeacion
query = """
SELECT 1
FROM alumno_extraccion_fecha
WHERE CURRENT_DATE BETWEEN fecha_inicio AND fecha_fin
ORDER BY CREATED_AT DESC
LIMIT 1;
"""
# Ejecuta la consulta
cursor.execute(query)
result = cursor.fetchone()
# Verifica si se obtuvo algún resultado
exists = result is not None
# Cierra el cursor y la conexión
cursor.close()
conn.close()
return exists
except Exception as e:
print(f"Error: {e}")
return False
app = Flask(__name__)
@app.route('/calificaciones', methods=['POST'])
def main():
global driver
# Initialize the WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)
username = request.form.get('clave')
password = request.form.get('password')
se_puede = se_puede_extraer()
if se_puede:
query = extract(username, password)
# Close the session
driver.quit()
return jsonify({"message": "Data extracted successfully", "en-fecha": se_puede})
if __name__ == '__main__':
serve(app, host='0.0.0.0', port=5000)