Skip to content

Commit

Permalink
Prevent duplication of statistics metadata (home-assistant#71637)
Browse files Browse the repository at this point in the history
* Prevent duplication of statistics metadata

* Add models_schema_28.py

* Handle entity renaming as a recorder job

* Improve tests
  • Loading branch information
emontnemery authored May 24, 2022
1 parent d620072 commit 23bd64b
Show file tree
Hide file tree
Showing 8 changed files with 1,175 additions and 32 deletions.
13 changes: 11 additions & 2 deletions homeassistant/components/recorder/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
async_track_time_interval,
async_track_utc_time_change,
)
from homeassistant.helpers.typing import UNDEFINED, UndefinedType
import homeassistant.util.dt as dt_util

from . import migration, statistics
Expand Down Expand Up @@ -461,10 +462,18 @@ def async_clear_statistics(self, statistic_ids: list[str]) -> None:

@callback
def async_update_statistics_metadata(
    self,
    statistic_id: str,
    *,
    new_statistic_id: str | UndefinedType = UNDEFINED,
    new_unit_of_measurement: str | None | UndefinedType = UNDEFINED,
) -> None:
    """Update statistics metadata for a statistic_id.

    Both updates are optional and keyword-only; the UNDEFINED sentinel
    (not None) marks "leave unchanged", since None is a valid unit value.
    The work is queued as an UpdateStatisticsMetadataTask and performed
    later by the recorder's task runner, not in this callback.
    """
    self.queue_task(
        UpdateStatisticsMetadataTask(
            statistic_id, new_statistic_id, new_unit_of_measurement
        )
    )

@callback
def async_external_statistics(
Expand Down
23 changes: 21 additions & 2 deletions homeassistant/components/recorder/migration.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@
StatisticsShortTerm,
process_timestamp,
)
from .statistics import delete_duplicates, get_start_time
from .statistics import (
delete_statistics_duplicates,
delete_statistics_meta_duplicates,
get_start_time,
)
from .util import session_scope

_LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -670,7 +674,7 @@ def _apply_update( # noqa: C901
# There may be duplicated statistics entries, delete duplicated statistics
# and try again
with session_scope(session=session_maker()) as session:
delete_duplicates(hass, session)
delete_statistics_duplicates(hass, session)
_create_index(
session_maker, "statistics", "ix_statistics_statistic_id_start"
)
Expand Down Expand Up @@ -705,6 +709,21 @@ def _apply_update( # noqa: C901
_create_index(session_maker, "states", "ix_states_context_id")
# Once there are no longer any state_changed events
# in the events table we can drop the index on states.event_id
elif new_version == 29:
# Recreate statistics_meta index to block duplicated statistic_id
_drop_index(session_maker, "statistics_meta", "ix_statistics_meta_statistic_id")
try:
_create_index(
session_maker, "statistics_meta", "ix_statistics_meta_statistic_id"
)
except DatabaseError:
# There may be duplicated statistics_meta entries, delete duplicates
# and try again
with session_scope(session=session_maker()) as session:
delete_statistics_meta_duplicates(session)
_create_index(
session_maker, "statistics_meta", "ix_statistics_meta_statistic_id"
)
else:
raise ValueError(f"No schema migration defined for version {new_version}")

Expand Down
4 changes: 2 additions & 2 deletions homeassistant/components/recorder/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
# pylint: disable=invalid-name
Base = declarative_base()

SCHEMA_VERSION = 28
SCHEMA_VERSION = 29

_LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -515,7 +515,7 @@ class StatisticsMeta(Base): # type: ignore[misc,valid-type]
)
__tablename__ = TABLE_STATISTICS_META
id = Column(Integer, Identity(), primary_key=True)
statistic_id = Column(String(255), index=True)
statistic_id = Column(String(255), index=True, unique=True)
source = Column(String(32))
unit_of_measurement = Column(String(255))
has_mean = Column(Boolean)
Expand Down
106 changes: 88 additions & 18 deletions homeassistant/components/recorder/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from homeassistant.helpers import entity_registry
from homeassistant.helpers.json import JSONEncoder
from homeassistant.helpers.storage import STORAGE_DIR
from homeassistant.helpers.typing import UNDEFINED, UndefinedType
import homeassistant.util.dt as dt_util
import homeassistant.util.pressure as pressure_util
import homeassistant.util.temperature as temperature_util
Expand Down Expand Up @@ -208,18 +209,11 @@ def as_dict(self) -> dict:
def async_setup(hass: HomeAssistant) -> None:
"""Set up the history hooks."""

def _entity_id_changed(event: Event) -> None:
"""Handle entity_id changed."""
old_entity_id = event.data["old_entity_id"]
entity_id = event.data["entity_id"]
with session_scope(hass=hass) as session:
session.query(StatisticsMeta).filter(
(StatisticsMeta.statistic_id == old_entity_id)
& (StatisticsMeta.source == DOMAIN)
).update({StatisticsMeta.statistic_id: entity_id})

async def _async_entity_id_changed(event: Event) -> None:
await hass.data[DATA_INSTANCE].async_add_executor_job(_entity_id_changed, event)
@callback
def _async_entity_id_changed(event: Event) -> None:
hass.data[DATA_INSTANCE].async_update_statistics_metadata(
event.data["old_entity_id"], new_statistic_id=event.data["entity_id"]
)

@callback
def entity_registry_changed_filter(event: Event) -> bool:
Expand Down Expand Up @@ -380,7 +374,7 @@ def _delete_duplicates_from_table(
return (total_deleted_rows, all_non_identical_duplicates)


def delete_duplicates(hass: HomeAssistant, session: Session) -> None:
def delete_statistics_duplicates(hass: HomeAssistant, session: Session) -> None:
"""Identify and delete duplicated statistics.
A backup will be made of duplicated statistics before it is deleted.
Expand Down Expand Up @@ -423,6 +417,69 @@ def delete_duplicates(hass: HomeAssistant, session: Session) -> None:
)


def _find_statistics_meta_duplicates(session: Session) -> list[int]:
    """Find duplicated statistics_meta."""
    # Flag every statistic_id that occurs more than once in statistics_meta.
    dupe_flags = (
        session.query(
            StatisticsMeta.statistic_id,
            literal_column("1").label("is_duplicate"),
        )
        .group_by(StatisticsMeta.statistic_id)
        .having(func.count() > 1)
        .subquery()
    )
    # Fetch the rows for the flagged ids, ordered so that within each
    # statistic_id the row with the highest id comes first.
    dupe_query = (
        session.query(StatisticsMeta)
        .outerjoin(
            dupe_flags,
            (dupe_flags.c.statistic_id == StatisticsMeta.statistic_id),
        )
        .filter(dupe_flags.c.is_duplicate == 1)
        .order_by(StatisticsMeta.statistic_id, StatisticsMeta.id.desc())
        .limit(1000 * MAX_ROWS_TO_PURGE)
    )
    rows = execute(dupe_query)

    duplicate_ids: list[int] = []
    if not rows:
        return duplicate_ids

    # Keep the first row of each statistic_id group (the highest id) and
    # collect every other row's id for deletion by the caller.
    previous_statistic_id = None
    for row in rows:
        if previous_statistic_id != row.statistic_id:
            previous_statistic_id = row.statistic_id
            continue
        duplicate_ids.append(row.id)

    return duplicate_ids


def _delete_statistics_meta_duplicates(session: Session) -> int:
    """Delete duplicated statistics_meta rows and return the number deleted."""
    deleted_total = 0
    # Each scan is capped, so loop until a scan turns up no more duplicates.
    while duplicate_ids := _find_statistics_meta_duplicates(session):
        # Delete in chunks of MAX_ROWS_TO_PURGE to keep each statement bounded.
        for offset in range(0, len(duplicate_ids), MAX_ROWS_TO_PURGE):
            chunk = duplicate_ids[offset : offset + MAX_ROWS_TO_PURGE]
            deleted_total += (
                session.query(StatisticsMeta)
                .filter(StatisticsMeta.id.in_(chunk))
                .delete(synchronize_session=False)
            )
    return deleted_total


def delete_statistics_meta_duplicates(session: Session) -> None:
    """Identify and delete duplicated statistics_meta."""
    # Only log when something was actually removed.
    if deleted_rows := _delete_statistics_meta_duplicates(session):
        _LOGGER.info(
            "Deleted %s duplicated statistics_meta rows", deleted_rows
        )


def _compile_hourly_statistics_summary_mean_stmt(
start_time: datetime, end_time: datetime
) -> StatementLambdaElement:
Expand Down Expand Up @@ -736,13 +793,26 @@ def clear_statistics(instance: Recorder, statistic_ids: list[str]) -> None:


def update_statistics_metadata(
    instance: Recorder,
    statistic_id: str,
    new_statistic_id: str | None | UndefinedType,
    new_unit_of_measurement: str | None | UndefinedType,
) -> None:
    """Update statistics metadata for a statistic_id.

    new_statistic_id: new id for the metadata row, or UNDEFINED to leave it
    unchanged; only rows whose source is this integration are renamed.
    new_unit_of_measurement: new unit (None is a valid unit), or UNDEFINED
    to leave it unchanged.
    """
    if new_unit_of_measurement is not UNDEFINED:
        with session_scope(session=instance.get_session()) as session:
            session.query(StatisticsMeta).filter(
                StatisticsMeta.statistic_id == statistic_id
            ).update({StatisticsMeta.unit_of_measurement: new_unit_of_measurement})
    if new_statistic_id is not UNDEFINED:
        # NOTE(review): the exception filter presumably handles the
        # unique-constraint violation raised when new_statistic_id already
        # exists — confirm against _filter_unique_constraint_integrity_error.
        with session_scope(
            session=instance.get_session(),
            exception_filter=_filter_unique_constraint_integrity_error(instance),
        ) as session:
            session.query(StatisticsMeta).filter(
                (StatisticsMeta.statistic_id == statistic_id)
                & (StatisticsMeta.source == DOMAIN)
            ).update({StatisticsMeta.statistic_id: new_statistic_id})


def list_statistic_ids(
Expand Down
9 changes: 7 additions & 2 deletions homeassistant/components/recorder/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import TYPE_CHECKING, Any

from homeassistant.core import Event
from homeassistant.helpers.typing import UndefinedType

from . import purge, statistics
from .const import DOMAIN, EXCLUDE_ATTRIBUTES
Expand Down Expand Up @@ -46,12 +47,16 @@ class UpdateStatisticsMetadataTask(RecorderTask):
"""Object to store statistics_id and unit for update of statistics metadata."""

statistic_id: str
unit_of_measurement: str | None
new_statistic_id: str | None | UndefinedType
new_unit_of_measurement: str | None | UndefinedType

def run(self, instance: Recorder) -> None:
    """Handle the task.

    Forwards the stored statistic_id together with the requested rename
    and/or unit change to statistics.update_statistics_metadata, which
    performs the actual database update.
    """
    statistics.update_statistics_metadata(
        instance,
        self.statistic_id,
        self.new_statistic_id,
        self.new_unit_of_measurement,
    )


Expand Down
2 changes: 1 addition & 1 deletion homeassistant/components/recorder/websocket_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def ws_update_statistics_metadata(
) -> None:
"""Update statistics metadata for a statistic_id."""
hass.data[DATA_INSTANCE].async_update_statistics_metadata(
msg["statistic_id"], msg["unit_of_measurement"]
msg["statistic_id"], new_unit_of_measurement=msg["unit_of_measurement"]
)
connection.send_result(msg["id"])

Expand Down
Loading

0 comments on commit 23bd64b

Please sign in to comment.