File Info

Rev. 0bc36b41d1b3964a397b5e569e9999af5bdd776c
Size 2,176 Bytes
Time 2021-10-01 17:38:14
Author Lorenzo Isella
Log Message

A simple script to perform some basic text mining.

Content

rm(list=ls())

library(tidyverse)
library(readxl)    # provides read_excel(), used below
library(stringr)
library(stringi)
library(openxlsx)
library(janitor)
library(tidytext)


source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
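
## Read the raw data set (title and description columns) from Excel;
## read_excel() comes from readxl.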

df_ini <- read_excel("etsi_wax_melts.xlsx")
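
## Clean the raw data: strip special characters from every column
## (remove_special_char() is presumably defined in the sourced stat_lib.R)
## and keep only the complete rows.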


df <- df_ini %>%
    mutate(across(everything(), ~ remove_special_char(.x, " "))) %>%
    filter(complete.cases(.))
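
## Keep the two text columns as separate one-column data frames.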


titles <- df %>%
    select(title)

descriptions <- df %>%
    select(description)
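
## Tokenize titles and descriptions into one word per row.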

tidy_titles <- titles %>%
    unnest_tokens(word, title)

tidy_descriptions <- descriptions %>%
    unnest_tokens(word, description)
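
## Drop common stop words using the stop_words lexicon shipped with tidytext.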


data(stop_words)

tidy_titles <- tidy_titles %>%
    anti_join(stop_words)

tidy_descriptions <- tidy_descriptions %>%
    anti_join(stop_words)
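
## Word frequencies, most common first.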



word_count_titles <- tidy_titles %>%
    count(word, sort = TRUE)  


word_count_descriptions <- tidy_descriptions %>%
    count(word, sort = TRUE) 
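
## A minimal sketch (not in the original script) of how the frequency tables
## could be inspected visually; ggplot2 is attached via the tidyverse above.
word_count_titles %>%
    slice_max(n, n = 20) %>%
    ggplot(aes(x = n, y = reorder(word, n))) +
    geom_col() +
    labs(x = "count", y = NULL, title = "Most frequent words in titles")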

### work on titles
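## Extract bigrams from the titles, split each bigram into its two words,
## drop pairs that contain a stop word, and count the remaining pairs.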

titles_bigrams <- titles %>%
    unnest_tokens(bigram, title, token = "ngrams", n = 2)

titles_bigrams_separated <- titles_bigrams %>%
    separate(bigram, c("word1", "word2"), sep = " ")

titles_bigrams_filtered <- titles_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)


titles_bigram_counts <- titles_bigrams_filtered %>% 
    count(word1, word2, sort = TRUE)
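
## A minimal sketch (not in the original script): the counted word pairs can
## be glued back into a single bigram column with tidyr::unite(), e.g. for
## later plotting or joining.
titles_bigram_counts %>%
    unite(bigram, word1, word2, sep = " ")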





### work on descriptions
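## The same bigram pipeline, applied to the descriptions.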

descriptions_bigrams <- descriptions %>%
    unnest_tokens(bigram, description, token = "ngrams", n = 2)

descriptions_bigrams_separated <- descriptions_bigrams %>%
    separate(bigram, c("word1", "word2"), sep = " ")

descriptions_bigrams_filtered <- descriptions_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)



descriptions_bigram_counts <- descriptions_bigrams_filtered %>% 
    count(word1, word2, sort = TRUE)
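
## Trigrams from the descriptions: tokenize, split into three words,
## drop any trigram containing a stop word, and count.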



trigram_description <- descriptions %>%
    unnest_tokens(trigram, description, token = "ngrams", n = 3) %>%
    separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
    count(word1, word2, word3, sort = TRUE)







print("So far so good")