Skip to content

Commit

Permalink
fix id field and update changelog (#125)
Browse files Browse the repository at this point in the history
  • Loading branch information
longshuicy authored Aug 22, 2024
1 parent 7a36a1b commit 0eda1fa
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 32 deletions.
4 changes: 4 additions & 0 deletions containerized_analytics/smile/preprocessing/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.6] - 08-22-2024

### Fixed
- ID field not extracted correctly from the data source [#121](https://github.com/ncsa/standalone-smm-analytics/issues/121)

## [0.1.5] - 01-23-2024

Expand Down
47 changes: 15 additions & 32 deletions containerized_analytics/smile/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,38 +10,21 @@ class Preprocess:

def __init__(self, df, column):
    """Collect the non-empty texts of ``column`` together with their ids.

    Args:
        df: pandas DataFrame holding the text column and, optionally, one
            of the known id columns listed below. NOTE: when none of them
            is present, a string column named ``'index'`` (built from
            ``df.index``) is added to ``df`` in place.
        column: name of the column in ``df`` that contains the text.

    Attributes set:
        self.id_column: name of the column the ids were read from.
        self.id: ids converted to ``str``, aligned with ``self.sentences``.
        self.sentences: texts converted to ``str`` with every substring
            matching ``http\\S+`` removed.

    Raises:
        KeyError: if ``column`` is not a column of ``df``.
    """
    # Candidate id columns, highest precedence first.
    potential_id_columns = [
        'id_str', 'id', 'comment_id', '_source.id_str', '_source.id',
    ]

    # Membership is tested against df.columns for every candidate; testing
    # the bare string literal instead would always be truthy and select a
    # column that may not exist.
    self.id_column = next(
        (col for col in potential_id_columns if col in df.columns),
        'index',
    )

    # No known id column: fall back to the row index so that every
    # sentence still has an id to be matched against.
    if self.id_column == 'index':
        df[self.id_column] = df.index.astype('str')

    # Drop rows whose text is the empty string, then rows where either the
    # id or the text is missing, so ids and sentences stay aligned.
    df_new = df[df[column] != ''][[self.id_column, column]].dropna()

    # str() guards against non-string cells (e.g. numbers) in the column.
    sentences = [
        re.sub(r"http\S+", "", str(tweet)) for tweet in df_new[column].tolist()
    ]

    self.id = df_new[self.id_column].astype('str').tolist()
    self.sentences = sentences

def get_phrases(self):
Expand Down

0 comments on commit 0eda1fa

Please sign in to comment.